Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
63c1c25
feat!: Add multi-dataset query support (resolves #1933).
junhaoliao Feb 14, 2026
f8c63ac
lint
junhaoliao Feb 14, 2026
64e814e
docs: Add `max_datasets_per_query` option to config templates
junhaoliao Feb 14, 2026
ad3db27
docs: Fix docstring formatting in `validate_datasets_exist` function
junhaoliao Feb 14, 2026
0354382
docs: Update documentation for multi-dataset query support
junhaoliao Feb 14, 2026
ad4f505
refactor: Simplify dataset queries by consolidating logic with UNION ALL
junhaoliao Feb 14, 2026
de7f7e3
refactor: Remove unnecessary type assertions in SearchState default v…
junhaoliao Feb 14, 2026
096371e
refactor: Rename selectDatasets to selectedDatasets for consistency; …
junhaoliao Feb 14, 2026
89384a2
refactor: Simplify fallback dataset selection logic using `useCallbac…
junhaoliao Feb 14, 2026
669641f
style: Adjust `selectContainerExpanded` width to improve UI layout co…
junhaoliao Feb 15, 2026
8451a51
refactor: Handle multi-dataset configurations consistently in StreamF…
junhaoliao Feb 15, 2026
f2ed336
lint
junhaoliao Feb 17, 2026
07d2b17
Merge branch 'main' into multi-dataset
junhaoliao Feb 17, 2026
f6b7a22
feat: Add `MaxDatasetsPerQuery` config to limit datasets per query an…
junhaoliao Feb 18, 2026
406c9f8
refactor: Replace `--datasets` with `--dataset` for improved consiste…
junhaoliao Feb 18, 2026
ba934b3
consolidate missing dataset checks
junhaoliao Feb 18, 2026
8a871f8
refactor: Standardize dataset null checks across modules and improve …
junhaoliao Feb 18, 2026
0602b19
Merge remote-tracking branch 'origin/main' into multi-dataset
junhaoliao Feb 18, 2026
1f1d177
refactor: Update dataset handling to use singular dataset attribute f…
junhaoliao Feb 19, 2026
985f86a
refactor: Simplify job handling by extracting search and extraction l…
junhaoliao Feb 19, 2026
48639de
lint
junhaoliao Feb 19, 2026
5831cc5
fix: Correct dataset fallback condition in DatasetSelect component
junhaoliao Feb 19, 2026
5cc96cc
Merge branch 'main' into multi-dataset
junhaoliao Feb 24, 2026
c419a4b
refactor: Replace f-strings with more structured logging in search an…
junhaoliao Feb 24, 2026
b601809
avoid f-string in ValueError() constructor
junhaoliao Feb 24, 2026
e9fe3f2
fix: Adjust missing dataset checks to explicitly verify non-empty sets
junhaoliao Feb 24, 2026
afa2b5a
fix: Remove redundant dataset assignment in extract_stream_task.py
junhaoliao Feb 24, 2026
1941e2f
refactor: Use uniform brace initialization in OutputHandlerImpl
junhaoliao Feb 24, 2026
3aef903
refactor: Change cDataset to std::string_view for improved performance
junhaoliao Feb 24, 2026
fecf5c4
docs: Add detailed docstrings for search job validation and stream ex…
junhaoliao Feb 24, 2026
db59832
fix: Add default dataset fallback in query scheduler when datasets ar…
junhaoliao Feb 24, 2026
d9261af
fix: Handle archives retrieval for searches without datasets in query…
junhaoliao Feb 26, 2026
e37971c
fix: Correct fallback handling for empty datasets in search API
junhaoliao Feb 26, 2026
6312a53
Merge branch 'main' into multi-dataset
junhaoliao Feb 26, 2026
51e86ef
Merge branch 'main' into multi-dataset
junhaoliao Mar 1, 2026
52beb34
refactor: Add support for single and multi-select modes in DatasetSel…
junhaoliao Mar 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions components/api-server/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ pub struct QueryConfig {
/// The search query as a KQL string.
pub query_string: String,

/// The dataset to search within. If not provided, only `default` dataset will be searched.
/// The datasets to search within. If not provided, only `default` dataset will be searched.
#[serde(default)]
pub dataset: Option<String>,
pub datasets: Option<Vec<String>>,

/// The maximum number of results to return. Set to `0` for no limit.
#[serde(default)]
Expand Down Expand Up @@ -58,7 +58,7 @@ pub struct QueryConfig {
impl From<QueryConfig> for SearchJobConfig {
fn from(value: QueryConfig) -> Self {
Self {
dataset: value.dataset,
datasets: value.datasets,
query_string: value.query_string,
max_num_results: value.max_num_results,
begin_timestamp: value.time_range_begin_millisecs,
Expand Down Expand Up @@ -128,10 +128,10 @@ impl Client {
/// * Forwards [`sqlx::query::Query::execute`]'s return values on failure.
pub async fn submit_query(&self, query_config: QueryConfig) -> Result<u64, ClientError> {
let mut search_job_config: SearchJobConfig = query_config.into();
if search_job_config.dataset.is_none() {
search_job_config.dataset = match self.config.package.storage_engine {
if search_job_config.datasets.is_none() {
search_job_config.datasets = match self.config.package.storage_engine {
StorageEngine::Clp => None,
StorageEngine::ClpS => Some("default".to_owned()),
StorageEngine::ClpS => Some(vec!["default".to_owned()]),
}
}
if search_job_config.max_num_results == 0 {
Expand Down
2 changes: 1 addition & 1 deletion components/api-server/src/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ async fn health() -> String {
content= QueryConfig,
example = json!({
"query_string": "*",
"dataset": "default",
"datasets": ["default"],
"time_range_begin_millisecs": 0,
"time_range_end_millisecs": 17_356_896,
"ignore_case": true,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from clp_package_utils.scripts.native.utils import (
run_function_in_process,
submit_query_job,
validate_dataset_exists,
validate_datasets_exist,
wait_for_query_job,
)

Expand All @@ -37,7 +37,7 @@
def create_and_monitor_job_in_db(
db_config: Database,
results_cache: ResultsCache,
dataset: str | None,
datasets: list[str] | None,
wildcard_query: str,
begin_timestamp: int | None,
end_timestamp: int | None,
Expand All @@ -48,7 +48,7 @@ def create_and_monitor_job_in_db(
count_by_time_bucket_size: int | None,
):
search_config = SearchJobConfig(
dataset=dataset,
datasets=datasets,
query_string=wildcard_query,
begin_timestamp=begin_timestamp,
end_timestamp=end_timestamp,
Expand Down Expand Up @@ -115,7 +115,7 @@ async def worker_connection_handler(reader: asyncio.StreamReader, writer: asynci
async def do_search_without_aggregation(
db_config: Database,
results_cache: ResultsCache,
dataset: str | None,
datasets: list[str] | None,
wildcard_query: str,
begin_timestamp: int | None,
end_timestamp: int | None,
Expand Down Expand Up @@ -144,7 +144,7 @@ async def do_search_without_aggregation(
create_and_monitor_job_in_db,
db_config,
results_cache,
dataset,
datasets,
wildcard_query,
begin_timestamp,
end_timestamp,
Expand Down Expand Up @@ -181,7 +181,7 @@ async def do_search_without_aggregation(
async def do_search(
db_config: Database,
results_cache: ResultsCache,
dataset: str | None,
datasets: list[str] | None,
wildcard_query: str,
begin_timestamp: int | None,
end_timestamp: int | None,
Expand All @@ -195,7 +195,7 @@ async def do_search(
await do_search_without_aggregation(
db_config,
results_cache,
dataset,
datasets,
wildcard_query,
begin_timestamp,
end_timestamp,
Expand All @@ -208,7 +208,7 @@ async def do_search(
create_and_monitor_job_in_db,
db_config,
results_cache,
dataset,
datasets,
wildcard_query,
begin_timestamp,
end_timestamp,
Expand All @@ -234,10 +234,11 @@ def main(argv):
)
args_parser.add_argument("wildcard_query", help="Wildcard query.")
args_parser.add_argument(
"--dataset",
"--datasets",
type=str,
nargs="+",
default=None,
help="The dataset that the archives belong to.",
help="The datasets that the archives belong to.",
)
args_parser.add_argument(
"--begin-time",
Expand Down Expand Up @@ -297,10 +298,10 @@ def main(argv):
return -1

database_config: Database = clp_config.database
dataset = parsed_args.dataset
if dataset is not None:
datasets = parsed_args.datasets
if datasets is not None:
try:
validate_dataset_exists(database_config, dataset)
validate_datasets_exist(database_config, datasets)
except Exception as e:
logger.error(e)
return -1
Expand All @@ -310,7 +311,7 @@ def main(argv):
do_search(
database_config,
clp_config.results_cache,
dataset,
datasets,
parsed_args.wildcard_query,
parsed_args.begin_time,
parsed_args.end_time,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,13 @@ def submit_query_job(
return db_cursor.lastrowid


def validate_dataset_exists(db_config: Database, dataset: str) -> None:
def validate_datasets_exist(db_config: Database, datasets: list[str]) -> None:
"""
Validates that `dataset` exists in the metadata database.
Validates that all datasets in `datasets` exist in the metadata database.

:param db_config:
:param dataset:
:raise: ValueError if the dataset doesn't exist.
:param datasets:
:raise: ValueError if any dataset doesn't exist.
"""
sql_adapter = SqlAdapter(db_config)
clp_db_connection_params = db_config.get_clp_connection_params_and_type(True)
Expand All @@ -87,8 +87,10 @@ def validate_dataset_exists(db_config: Database, dataset: str) -> None:
closing(sql_adapter.create_connection(True)) as db_conn,
closing(db_conn.cursor(dictionary=True)) as db_cursor,
):
if dataset not in fetch_existing_datasets(db_cursor, table_prefix):
raise ValueError(f"Dataset `{dataset}` doesn't exist.")
existing_datasets = fetch_existing_datasets(db_cursor, table_prefix)
for dataset in datasets:
if dataset not in existing_datasets:
raise ValueError(f"Dataset `{dataset}` doesn't exist.")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we report all datasets that do not exist in the metadata database?



def wait_for_query_job(sql_adapter: SqlAdapter, job_id: int) -> QueryJobStatus:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,11 @@ def main(argv):
)
args_parser.add_argument("wildcard_query", help="Wildcard query.")
args_parser.add_argument(
"--dataset",
"--datasets",
type=str,
nargs="+",
default=None,
help="The dataset that the archives belong to.",
help="The datasets that the archives belong to.",
)
args_parser.add_argument(
"--begin-time",
Expand Down Expand Up @@ -113,16 +114,17 @@ def main(argv):
)
return -1

dataset = parsed_args.dataset
datasets = parsed_args.datasets
if StorageEngine.CLP_S == storage_engine:
dataset = CLP_DEFAULT_DATASET_NAME if dataset is None else dataset
datasets = [CLP_DEFAULT_DATASET_NAME] if datasets is None else datasets
try:
clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
validate_dataset_name(clp_db_connection_params["table_prefix"], dataset)
for ds in datasets:
validate_dataset_name(clp_db_connection_params["table_prefix"], ds)
except Exception as e:
logger.error(e)
return -1
elif dataset is not None:
elif datasets is not None:
logger.error(f"Dataset selection is not supported for storage engine: {storage_engine}.")
return -1

Expand Down Expand Up @@ -152,9 +154,9 @@ def main(argv):
# fmt: on
if parsed_args.verbose:
search_cmd.append("--verbose")
if dataset is not None:
search_cmd.append("--dataset")
search_cmd.append(dataset)
if datasets is not None:
search_cmd.append("--datasets")
search_cmd.extend(datasets)
if parsed_args.begin_time is not None:
search_cmd.append("--begin-time")
search_cmd.append(str(parsed_args.begin_time))
Expand Down
1 change: 1 addition & 0 deletions components/clp-py-utils/clp_py_utils/clp_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,7 @@ class QueryScheduler(BaseModel):
host: DomainStr = "localhost"
port: Port = DEFAULT_PORT
jobs_poll_delay: PositiveFloat = 0.1 # seconds
max_datasets_per_query: PositiveInt | None = 10
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Add an inline comment documenting the None semantics.

Users editing config files need to know that setting this field to null means "unlimited." A brief comment would help:

📝 Suggested documentation
-    max_datasets_per_query: PositiveInt | None = 10
+    max_datasets_per_query: PositiveInt | None = 10  # None means unlimited
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
max_datasets_per_query: PositiveInt | None = 10
max_datasets_per_query: PositiveInt | None = 10 # None means unlimited
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@components/clp-py-utils/clp_py_utils/clp_config.py` at line 435, The config
field max_datasets_per_query (in clp_config.py) lacks documentation for None
semantics—add a brief inline comment next to its declaration explaining that
setting this field to None (or null in YAML/JSON) means "unlimited" datasets per
query; keep the comment concise and on the same line or immediately above the
declaration so users editing config files immediately see that None/null =
unlimited.

num_archives_to_search_per_sub_job: PositiveInt = 16
logging_level: LoggingLevel = "INFO"

Expand Down
2 changes: 1 addition & 1 deletion components/clp-rust-utils/src/job_config/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub const QUERY_JOBS_TABLE_NAME: &str = "query_jobs";
#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)]
#[serde(default)]
pub struct SearchJobConfig {
pub dataset: Option<String>,
pub datasets: Option<Vec<String>>,
pub query_string: String,
pub max_num_results: u32,
pub begin_timestamp: Option<i64>,
Expand Down
4 changes: 4 additions & 0 deletions components/core/src/clp_s/CommandLineArguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
po::value<uint64_t>(&m_max_num_results)->value_name("MAX")->
default_value(m_max_num_results),
"The maximum number of results to output"
)(
"dataset",
po::value<std::string>(&m_dataset)->value_name("DATASET"),
"The dataset name to include in each result document"
);
Comment on lines +608 to 612
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The dataset name needs to be included in the query results before dumping them into the results cache. I know this feels like a hack. A more future-proof interface might be to accept a JSON object string `--extra-result-metadata` and merge such objects with the results before dumping them into the results cache. What do you think?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, we can add multi-dataset support to the stream extraction flow as well. Then we would just pass an array of datasets in the stream extraction config, avoiding touching the clp-s binary.

I think this may be more aligned with the future plan of using a single table for all datasets (though maybe I don't have a correct understanding of how we are going to refactor the datasets feature).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline — the `--dataset` CLI flag is fine. We will remove it when we refactor the CLI.


po::options_description file_output_handler_options("File Output Handler Options");
Expand Down
3 changes: 3 additions & 0 deletions components/core/src/clp_s/CommandLineArguments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ class CommandLineArguments {

bool get_record_log_order() const { return false == m_disable_log_order; }

std::string const& get_dataset() const { return m_dataset; }

private:
// Methods
/**
Expand Down Expand Up @@ -227,6 +229,7 @@ class CommandLineArguments {
std::optional<epochtime_t> m_search_end_ts;
bool m_ignore_case{false};
std::vector<std::string> m_projection_columns;
std::string m_dataset;

// Search aggregation variables
std::string m_reducer_host;
Expand Down
14 changes: 11 additions & 3 deletions components/core/src/clp_s/OutputHandlerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,13 @@ ResultsCacheOutputHandler::ResultsCacheOutputHandler(
string const& collection,
uint64_t batch_size,
uint64_t max_num_results,
string dataset,
bool should_output_timestamp
)
: ::clp_s::search::OutputHandler(should_output_timestamp, true),
m_batch_size(batch_size),
m_max_num_results(max_num_results) {
m_max_num_results(max_num_results),
m_dataset(std::move(dataset)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
m_max_num_results(max_num_results),
m_dataset(std::move(dataset)) {
m_max_num_results{max_num_results},
m_dataset{std::move(dataset)} {

Nit: coding guidelines.

try {
auto mongo_uri = mongocxx::uri(uri);
m_client = mongocxx::client(mongo_uri);
Expand Down Expand Up @@ -114,6 +116,10 @@ ErrorCode ResultsCacheOutputHandler::flush() {
bsoncxx::builder::basic::kvp(
constants::results_cache::search::cLogEventIx,
result.log_event_idx
),
bsoncxx::builder::basic::kvp(
constants::results_cache::search::cDataset,
std::move(result.dataset)
)
)
)
Expand Down Expand Up @@ -154,7 +160,8 @@ void ResultsCacheOutputHandler::write(
message,
timestamp,
archive_id,
log_event_idx
log_event_idx,
m_dataset
)
);
} else if (m_latest_results.top()->timestamp < timestamp) {
Expand All @@ -165,7 +172,8 @@ void ResultsCacheOutputHandler::write(
message,
timestamp,
archive_id,
log_event_idx
log_event_idx,
m_dataset
)
);
}
Expand Down
9 changes: 7 additions & 2 deletions components/core/src/clp_s/OutputHandlerImpl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,22 @@ class ResultsCacheOutputHandler : public ::clp_s::search::OutputHandler {
std::string_view message,
epochtime_t timestamp,
std::string_view archive_id,
int64_t log_event_idx
int64_t log_event_idx,
std::string_view dataset
)
: original_path(original_path),
message(message),
timestamp(timestamp),
archive_id(archive_id),
log_event_idx(log_event_idx) {}
log_event_idx(log_event_idx),
dataset(dataset) {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
log_event_idx(log_event_idx),
dataset(dataset) {}
log_event_idx{log_event_idx},
dataset{dataset} {}


std::string original_path;
std::string message;
epochtime_t timestamp;
std::string archive_id;
int64_t log_event_idx;
std::string dataset;
};

struct QueryResultGreaterTimestampComparator {
Expand All @@ -165,6 +168,7 @@ class ResultsCacheOutputHandler : public ::clp_s::search::OutputHandler {
std::string const& collection,
uint64_t batch_size,
uint64_t max_num_results,
std::string dataset,
bool should_output_metadata = true
);

Expand All @@ -191,6 +195,7 @@ class ResultsCacheOutputHandler : public ::clp_s::search::OutputHandler {
std::vector<bsoncxx::document::value> m_results;
uint64_t m_batch_size;
uint64_t m_max_num_results;
std::string m_dataset;
std::priority_queue<
std::unique_ptr<QueryResult>,
std::vector<std::unique_ptr<QueryResult>>,
Expand Down
1 change: 1 addition & 0 deletions components/core/src/clp_s/archive_constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ constexpr char cLogEventIx[]{"log_event_ix"};
constexpr char cTimestamp[]{"timestamp"};
constexpr char cMessage[]{"message"};
constexpr char cArchiveId[]{"archive_id"};
constexpr char cDataset[]{"dataset"};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
constexpr char cDataset[]{"dataset"};
constexpr std::string_view cDataset{"dataset"};

To comply with our current coding guidelines.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably also move all of these results cache related constants out of archive_constants.hpp at some point, but not as part of this PR. @coderabbitai can you create an issue to track?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain

@gibber9809, I'll create an issue to track moving the results cache constants out of archive_constants.hpp.


🏁 Script executed:

#!/bin/bash
# Create a GitHub issue to track refactoring results cache constants

gh issue create \
  --repo y-scope/clp \
  --title "Refactor: Move results cache constants out of archive_constants.hpp" \
  --body "## Description

The results cache related constants (in \`results_cache::decompression\` and \`results_cache::search\` namespaces) should be moved out of \`components/core/src/clp_s/archive_constants.hpp\` into a more appropriate location, as they are not directly related to archive structure.

## Affected Code

File: \`components/core/src/clp_s/archive_constants.hpp\`

Namespaces to relocate:
- \`clp_s::constants::results_cache::decompression\`
- \`clp_s::constants::results_cache::search\`

## Context

This refactoring was identified during review of PR `#1992`:
- PR: https://github.com/y-scope/clp/pull/1992
- Comment: https://github.com/y-scope/clp/pull/1992#discussion_r2843034948

Requested by: `@gibber9809`" \
  --assignee gibber9809

Length of output: 42


@gibber9809, I've created issue #2016 to track moving the results cache constants (results_cache::decompression and results_cache::search namespaces) out of archive_constants.hpp. The issue has been assigned to you and includes backlinks to this PR and comment for context.


🧠 Learnings used
Learnt from: AVMatthews
Repo: y-scope/clp PR: 543
File: components/core/src/clp_s/JsonParser.cpp:581-627
Timestamp: 2024-10-07T21:38:35.979Z
Learning: In `components/core/src/clp_s/JsonParser.cpp`, within the `get_archive_node_id` method, throwing a string literal as an exception is acceptable practice.

Learnt from: AVMatthews
Repo: y-scope/clp PR: 543
File: components/core/src/clp_s/JsonParser.cpp:581-627
Timestamp: 2024-10-08T15:52:50.753Z
Learning: In `components/core/src/clp_s/JsonParser.cpp`, within the `get_archive_node_id` method, throwing a string literal as an exception is acceptable practice.

Learnt from: gibber9809
Repo: y-scope/clp PR: 1163
File: components/core/src/clp_s/ColumnWriter.hpp:120-122
Timestamp: 2025-08-06T23:57:49.762Z
Learning: In the CLP codebase, the `encoded_log_dict_id_t` type alias in `components/core/src/clp_s/ColumnWriter.hpp` is appropriately placed as a class-scoped alias because the ColumnWriter.hpp header is already included in all places where this type is used, eliminating the need to move it to a common header.

Learnt from: davemarco
Repo: y-scope/clp PR: 700
File: components/core/src/clp/streaming_archive/ArchiveMetadata.hpp:153-155
Timestamp: 2025-01-30T19:26:33.869Z
Learning: When working with constexpr strings (string literals with static storage duration), std::string_view is the preferred choice for member variables as it's more efficient and safe, avoiding unnecessary memory allocations.

Learnt from: haiqi96
Repo: y-scope/clp PR: 646
File: components/core/src/clp/streaming_archive/writer/Archive.hpp:354-354
Timestamp: 2025-01-14T16:06:54.692Z
Learning: Member variables in C++ classes should be explicitly initialized in the constructor to prevent undefined behavior, as demonstrated in the Archive class where `m_use_single_file_archive` is initialized to `false`.

Learnt from: LinZhihao-723
Repo: y-scope/clp PR: 873
File: components/core/src/clp/ffi/ir_stream/search/QueryHandlerImpl.cpp:148-157
Timestamp: 2025-05-02T22:27:59.347Z
Learning: In the `QueryHandlerImpl.cpp` file, the `unique_projected_columns` set (using `std::string_view`) is intentionally designed to only check for duplications within the local scope of the `create_projected_columns_and_projection_map` function. The team decided this is an acceptable use of `std::string_view` in a container since the referenced strings remain valid throughout the function's execution.

} // namespace results_cache::search
} // namespace clp_s::constants
#endif // CLP_S_ARCHIVE_CONSTANTS_HPP
3 changes: 2 additions & 1 deletion components/core/src/clp_s/clp-s.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,8 @@ bool search_archive(
command_line_arguments.get_mongodb_uri(),
command_line_arguments.get_mongodb_collection(),
command_line_arguments.get_batch_size(),
command_line_arguments.get_max_num_results()
command_line_arguments.get_max_num_results(),
command_line_arguments.get_dataset()
);
break;
case CommandLineArguments::OutputHandlerType::Stdout:
Expand Down
Loading
Loading