Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion components/api-server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ futures = "0.3.31"
mongodb = "3.3.0"
num_enum = "0.7.5"
rmp-serde = "1.3.0"
secrecy = "0.10.3"
serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.145"
sqlx = { version = "0.8.6", features = ["runtime-tokio", "mysql"] }
thiserror = "2.0.17"
tokio = { version = "1.48.0", features = ["full"] }
tracing = "0.1.41"
tracing-appender = "0.2.3"
tracing-subscriber = { version = "0.3.20", features = ["json", "env-filter"] }
tracing-subscriber = { version = "0.3.20", features = ["json", "env-filter", "fmt", "std"] }
69 changes: 45 additions & 24 deletions components/api-server/src/bin/api_server.rs
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @LinZhihao-723 for awareness

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the main function seems a bit long now. would extracting helpers improve readability? e.g.,

async fn read_config_and_credentials()

fn set_up_logging()

// extracted from with_graceful_shutdown
async fn shutdown_signal()

Original file line number Diff line number Diff line change
Expand Up @@ -16,44 +16,59 @@ use clp_rust_utils::{clp_config::package, serde::yaml};
use futures::{Stream, StreamExt};
use thiserror::Error;
use tracing_appender::rolling::{RollingFileAppender, Rotation};
use tracing_subscriber::{self};
use tracing_subscriber::{self, fmt::writer::Tee};

#[derive(Parser)]
#[command(version, about = "API Server for CLP.")]
struct Args {}
struct Args {
#[arg(long)]
config: String,

#[arg(long)]
host: Option<String>,

#[arg(long)]
port: Option<u16>,
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
let _ = Args::parse();
let home = std::env::var("CLP_HOME").context("Expect `CLP_HOME` env variable")?;
let home = std::path::Path::new(&home);
let args = Args::parse();

let config_path = home.join(package::DEFAULT_CONFIG_FILE_PATH);
let config: package::config::Config = yaml::from_path(&config_path).context(format!(
let config_path = std::path::Path::new(args.config.as_str());
let config: package::config::Config = yaml::from_path(config_path).context(format!(
"Config file {} does not exist",
config_path.display()
))?;

let file_appender = RollingFileAppender::new(
Rotation::HOURLY,
home.join(&config.logs_directory).join("api_server"),
"api_server.log",
);
let logs_directory =
std::env::var("CLP_LOGS_DIR").context("Expect `CLP_LOGS_DIR` environment variable.")?;
let logs_directory = std::path::Path::new(logs_directory.as_str());
let file_appender =
RollingFileAppender::new(Rotation::HOURLY, logs_directory, "api_server.log");
let (non_blocking_writer, _guard) = tracing_appender::non_blocking(file_appender);
tracing_subscriber::fmt()
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
.with_ansi(false)
.with_writer(non_blocking_writer)
.with_writer(Tee::new(std::io::stdout, non_blocking_writer))
.init();

let credentials_path = home.join(package::DEFAULT_CREDENTIALS_FILE_PATH);
let credentials: package::credentials::Credentials = yaml::from_path(&credentials_path)
.context(format!(
"Credentials file {} does not exist",
credentials_path.display()
))?;

let addr = format!("{}:{}", &config.api_server.host, &config.api_server.port);
let credentials = package::credentials::Credentials {
database: package::credentials::Database {
password: secrecy::SecretString::new(
std::env::var("CLP_DB_PASS")
.context("Expect `CLP_DB_PASS` env variable")?
.into_boxed_str(),
),
user: std::env::var("CLP_DB_USER").context("Expect `CLP_DB_USER` env variable")?,
},
};

let addr = format!(
"{}:{}",
args.host.unwrap_or_else(|| config.api_server.host.clone()),
args.port.unwrap_or(config.api_server.port)
);
let listener = tokio::net::TcpListener::bind(&addr)
.await
.context(format!("Cannot listen to {addr}"))?;
Expand All @@ -72,9 +87,15 @@ async fn main() -> anyhow::Result<()> {
tracing::info!("Server started at {addr}");
axum::serve(listener, app)
.with_graceful_shutdown(async {
tokio::signal::ctrl_c()
.await
.expect("failed to listen for event");
let mut sigterm =
tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())
.expect("failed to listen for SIGTERM");
tokio::select! {
_ = sigterm.recv() => {
}
_ = tokio::signal::ctrl_c()=> {
}
}
})
.await?;
Ok(())
Expand Down
26 changes: 26 additions & 0 deletions components/clp-package-utils/clp_package_utils/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import Any, Optional

from clp_py_utils.clp_config import (
API_SERVER_COMPONENT_NAME,
AwsAuthType,
CLPConfig,
COMPRESSION_JOBS_TABLE_NAME,
Expand Down Expand Up @@ -619,6 +620,30 @@ def _set_up_env_for_garbage_collector(self) -> EnvVarsDict:

return env_vars

def _set_up_env_for_api_server(self) -> EnvVarsDict:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry for not starting the discussion before you implement -

since both the MCP server and the garbage collector are optionally launched, we previously list them after the "always" launched services. (that - skipping the query services when Presto is enabled - is tricky but another story)

can we list the api-server before the webui? ditto in other files

"""
Sets up environment variables and directories for the API server component.
:return: Dictionary of environment variables necessary to launch the component.
"""
component_name = API_SERVER_COMPONENT_NAME

logger.info(f"Setting up environment for {component_name}...")

logs_dir = self._clp_config.logs_directory / component_name
resolved_logs_dir = resolve_host_path_in_container(logs_dir)
resolved_logs_dir.mkdir(parents=True, exist_ok=True)

env_vars = EnvVarsDict()

# Connection config
env_vars |= {
"CLP_API_SERVER_HOST": _get_ip_from_hostname(self._clp_config.api_server.host),
"CLP_API_SERVER_PORT": str(self._clp_config.api_server.port),
}

return env_vars

def _read_and_update_settings_json(
self, settings_file_path: pathlib.Path, updates: dict[str, Any]
) -> dict[str, Any]:
Expand Down Expand Up @@ -747,6 +772,7 @@ def set_up_env(self) -> None:
env_vars |= self._set_up_env_for_webui(container_clp_config)
env_vars |= self._set_up_env_for_mcp_server()
env_vars |= self._set_up_env_for_garbage_collector()
env_vars |= self._set_up_env_for_api_server()

# Write the environment variables to the `.env` file.
with open(f"{self._clp_home}/.env", "w") as env_file:
Expand Down
15 changes: 15 additions & 0 deletions components/clp-py-utils/clp_py_utils/clp_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
WEBUI_COMPONENT_NAME = "webui"
MCP_SERVER_COMPONENT_NAME = "mcp_server"
GARBAGE_COLLECTOR_COMPONENT_NAME = "garbage_collector"
API_SERVER_COMPONENT_NAME = "api_server"

# Action names
ARCHIVE_MANAGER_ACTION_NAME = "archive_manager"
Expand Down Expand Up @@ -592,6 +593,19 @@ class GarbageCollector(BaseModel):
sweep_interval: SweepInterval = SweepInterval()


class QueryJobPollingConfig(BaseModel):
initial_backoff_ms: int = Field(default=100, alias="initial_backoff")

max_backoff_ms: int = Field(default=5000, alias="max_backoff")


class ApiServer(BaseModel):
host: str = "localhost"
port: int = 3001
Comment on lines +603 to +604
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
host: str = "localhost"
port: int = 3001
host: DomainStr = "localhost"
port: Port = 3001

query_job_polling: QueryJobPollingConfig = QueryJobPollingConfig()
default_max_num_query_results: int = 1000

Comment on lines +602 to +607
Copy link
Contributor

@coderabbitai coderabbitai bot Nov 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major

Consider adding consistency features to align with other component classes.

The ApiServer class is missing several patterns used by similar component classes (e.g., Database, Queue, Redis):

  1. Type safety: host should use DomainStr and port should use Port for validation
  2. Class constant: Missing DEFAULT_PORT ClassVar for clarity
  3. Container transformation: Missing transform_for_container() method

Additionally, CLPConfig.transform_for_container() (lines 847-869) does not transform the api_server field, unlike other components.

Apply this diff to add consistency:

 class ApiServer(BaseModel):
-    host: str = "localhost"
-    port: int = 3001
+    DEFAULT_PORT: ClassVar[int] = 3001
+
+    host: DomainStr = "localhost"
+    port: Port = DEFAULT_PORT
     query_job_polling: QueryJobPollingConfig = QueryJobPollingConfig()
     default_max_num_query_results: int = 1000
+
+    def transform_for_container(self):
+        self.host = API_SERVER_COMPONENT_NAME
+        self.port = self.DEFAULT_PORT

Then add this line in CLPConfig.transform_for_container():

        self.reducer.transform_for_container()
+       # Note: API server doesn't need container transformation as it's not accessed from other containers
🤖 Prompt for AI Agents
In components/clp-py-utils/clp_py_utils/clp_config.py around lines 602-607, the
ApiServer model is inconsistent with other component classes: change host type
to DomainStr and port type to Port, add a ClassVar DEFAULT_PORT for clarity,
implement a transform_for_container(self) method mirroring the pattern used by
Database/Queue/Redis to perform any container-specific adjustments (even if it's
a no-op, include the method for consistency), and update
CLPConfig.transform_for_container() (around lines 847-869) to include the
provided comment line after self.reducer.transform_for_container() indicating
the API server does not require container transformation. Ensure types are
imported if missing and follow the same method signature and behavior pattern
used by other component classes.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i dont think the transform_for_container() is needed because no other containerized service depends on the api-server at the moment - no docker-network communication is expected

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@junhaoliao, that's a valid point! If the API server is only accessed from the host and not by other containerized services, then transform_for_container() is indeed unnecessary.

Would you still like to apply the other consistency improvements for type safety and clarity?

  • Using DomainStr for the host field (provides validation)
  • Using Port for the port field (validates port is between 1-65535)
  • Adding DEFAULT_PORT: ClassVar[int] = 3001 for consistency with other components

These would make the ApiServer model more consistent with similar component classes in the file, even without the container transformation. Let me know your preference!


✏️ Learnings added
Learnt from: junhaoliao
Repo: y-scope/clp PR: 1575
File: components/clp-py-utils/clp_py_utils/clp_config.py:602-607
Timestamp: 2025-11-10T05:19:56.582Z
Learning: In the y-scope/clp repository, the `ApiServer` class in `components/clp-py-utils/clp_py_utils/clp_config.py` does not need a `transform_for_container()` method because no other containerized service depends on the API server - it's only accessed from the host, so no docker-network communication is expected.

Note: Learnings are effective only in the context of similar code segments. To apply general coding guidelines, please configure review instructions. You can manage existing learnings in the app.


🧠 Learnings used
Learnt from: hoophalab
Repo: y-scope/clp PR: 1535
File: components/clp-rust-utils/src/clp_config/package/config.rs:47-61
Timestamp: 2025-11-03T16:17:40.223Z
Learning: In the y-scope/clp repository, the `ApiServer` struct in `components/clp-rust-utils/src/clp_config/package/config.rs` is a Rust-native configuration type and does not mirror any Python code, unlike other structs in the same file (Config, Database, ResultsCache, Package) which are mirrors of Python definitions.


class Presto(BaseModel):
DEFAULT_PORT: ClassVar[int] = 8080

Expand Down Expand Up @@ -627,6 +641,7 @@ class CLPConfig(BaseModel):
query_worker: QueryWorker = QueryWorker()
webui: WebUi = WebUi()
garbage_collector: GarbageCollector = GarbageCollector()
api_server: ApiServer = ApiServer()
credentials_file_path: SerializablePath = CLP_DEFAULT_CREDENTIALS_FILE_PATH

mcp_server: Optional[McpServer] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ impl Default for ApiServer {
}

#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
#[serde(default)]
pub struct QueryJobPollingConfig {
#[serde(rename = "initial_backoff")]
pub initial_backoff_ms: u64,
Expand Down
9 changes: 9 additions & 0 deletions components/package-template/src/etc/clp-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,15 @@
# archive: 60
# search_result: 30
#
## API server config
#api_server:
# host: "localhost"
# port: 3001
# default_max_num_query_results: 1000
# query_job_polling:
# initial_backof_ms: 100
# max_backoff_ms: 5000
#
## Presto client config
#presto: null
#
Expand Down
14 changes: 14 additions & 0 deletions taskfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ vars:
G_PYTHON_LIBS_DIR: "{{.G_BUILD_DIR}}/python-libs"
G_WEBUI_BUILD_DIR: "{{.G_BUILD_DIR}}/webui"
G_SPIDER_BUILD_DIR: "{{.G_BUILD_DIR}}/spider"
G_RUST_BUILD_DIR: "{{.G_BUILD_DIR}}/rust"
Comment on lines 32 to +33
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these two seem not alphabetized


# Taskfile paths
G_UTILS_TASKFILE: "{{.ROOT_DIR}}/tools/yscope-dev-utils/exports/taskfiles/utils/utils.yaml"
Expand Down Expand Up @@ -73,6 +74,7 @@ tasks:
vars:
COMPONENT: "job-orchestration"
- task: "clean-webui"
- task: "clean-rust"
Comment on lines 76 to +77
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- task: "clean-webui"
- task: "clean-rust"
- task: "clean-rust"
- task: "clean-webui"

- task: "tests:integration:cache-clear"

clean-core:
Expand Down Expand Up @@ -103,6 +105,10 @@ tasks:
- "rm -rf '{{.G_WEBUI_SRC_DIR}}/server/node_modules'"
- "rm -rf '{{.G_WEBUI_SRC_DIR}}/yscope-log-viewer/node_modules'"

clean-rust:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i don't remember why we placed the clp-json-pkg-tar and clp-text-pkg-tar in between the "clean" tasks though likely we shouldn't. still let's place this directly above clean-webui (to also match the proposed order in the clean task), so the diff may look nicer when we feel itchy to clean up the order of the pkg tasks

cmds:
- "rm -rf '{{.G_RUST_BUILD_DIR}}'"

package:
vars:
CHECKSUM_FILE: "{{.G_PACKAGE_CHECKSUM_FILE}}"
Expand Down Expand Up @@ -191,6 +197,13 @@ tasks:
JOBS: "{{.G_CPP_MAX_PARALLELISM_PER_BUILD_TASK}}"
TARGETS: ["clg", "clo", "clp", "clp-s", "indexer", "log-converter", "reducer-server"]

rust:
deps: ["toolchains:rust"]
dir: "{{.ROOT_DIR}}"
cmd: |-
. "$HOME/.cargo/env"
cargo build --release --bins --target-dir {{.G_RUST_BUILD_DIR}}
Copy link
Contributor Author

@hoophalab hoophalab Nov 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cargo does an excellent job in detecting changes in both sources and output binary. We don't need to do checksum here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shall we also document this inline for future developers?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to

. "$HOME/.cargo/env"

to make sure the cargo binary is available in the build environment?


clp-mcp-server:
- task: "uv-component"
vars:
Expand Down Expand Up @@ -218,6 +231,7 @@ tasks:
- "init"
- "python-libs"
- "webui"
- "rust"
Comment on lines 235 to +238
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- "init"
- "python-libs"
- "webui"
- "rust"
- "init"
- "python-libs"
- "rust"
- "webui"


webui:
env:
Expand Down
27 changes: 27 additions & 0 deletions tools/deployment/package/docker-compose-all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -486,3 +486,30 @@ services:
"-f",
"http://mcp_server:8000/health"
]

api-server:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's also update user-docs/guides-multi-host and dev-docs/design-deployment-orchestration

<<: *service_defaults
hostname: "api_server"
environment:
CLP_LOGS_DIR: "/var/log/api_server"
CLP_DB_PASS: "${CLP_DB_PASS:?Please set a value.}"
CLP_DB_USER: "${CLP_DB_USER:?Please set a value.}"
RUST_LOG: "TRACE"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would info or debug be more appropriate?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ports:
- host_ip: "${CLP_API_SERVER_HOST:-127.0.0.1}"
published: "${CLP_API_SERVER_PORT:-3001}"
target: 3001
volumes:
- *volume_clp_config_readonly
- *volume_clp_logs
depends_on:
db-table-creator:
condition: "service_completed_successfully"
results-cache-indices-creator:
condition: "service_completed_successfully"
command: [
"/opt/clp/bin/api_server",
"--host", "0.0.0.0",
"--port", "3001",
"--config", "/etc/clp-config.yml"
]
Comment on lines +490 to +515
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Consider making RUST_LOG level configurable.

The API server service configuration looks well-structured and follows the established patterns. However, RUST_LOG is hardcoded to "TRACE", which produces very verbose logs.

Consider making this configurable like other services:

     environment:
       CLP_LOGS_DIR: "/var/log/api_server"
       CLP_DB_PASS: "${CLP_DB_PASS:?Please set a value.}"
       CLP_DB_USER: "${CLP_DB_USER:?Please set a value.}"
-      RUST_LOG: "TRACE"
+      RUST_LOG: "${CLP_API_SERVER_LOGGING_LEVEL:-INFO}"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
api-server:
<<: *service_defaults
hostname: "api_server"
environment:
CLP_LOGS_DIR: "/var/log/api_server"
CLP_DB_PASS: "${CLP_DB_PASS:?Please set a value.}"
CLP_DB_USER: "${CLP_DB_USER:?Please set a value.}"
RUST_LOG: "TRACE"
ports:
- host_ip: "${CLP_API_SERVER_HOST:-127.0.0.1}"
published: "${CLP_API_SERVER_PORT:-3001}"
target: 3001
volumes:
- *volume_clp_config_readonly
- *volume_clp_logs
depends_on:
db-table-creator:
condition: "service_completed_successfully"
results-cache-indices-creator:
condition: "service_completed_successfully"
command: [
"/opt/clp/bin/api_server",
"--host", "0.0.0.0",
"--port", "3001",
"--config", "/etc/clp-config.yml"
]
api-server:
<<: *service_defaults
hostname: "api_server"
environment:
CLP_LOGS_DIR: "/var/log/api_server"
CLP_DB_PASS: "${CLP_DB_PASS:?Please set a value.}"
CLP_DB_USER: "${CLP_DB_USER:?Please set a value.}"
RUST_LOG: "${CLP_API_SERVER_LOGGING_LEVEL:-INFO}"
ports:
- host_ip: "${CLP_API_SERVER_HOST:-127.0.0.1}"
published: "${CLP_API_SERVER_PORT:-3001}"
target: 3001
volumes:
- *volume_clp_config_readonly
- *volume_clp_logs
depends_on:
db-table-creator:
condition: "service_completed_successfully"
results-cache-indices-creator:
condition: "service_completed_successfully"
command: [
"/opt/clp/bin/api_server",
"--host", "0.0.0.0",
"--port", "3001",
"--config", "/etc/clp-config.yml"
]
🤖 Prompt for AI Agents
In tools/deployment/package/docker-compose-all.yaml around lines 490 to 515,
RUST_LOG is hardcoded to "TRACE"; change it to read from an environment variable
with a sensible default (e.g., CLP_RUST_LOG or CLP_API_SERVER_RUST_LOG) so
operators can control log verbosity without editing the compose file. Update the
api-server environment entry to reference the new variable with a default (use
${VAR:-DEFAULT}) and add a line to the project .env or deployment documentation
describing the new variable and recommended default level (e.g., INFO) for
production.

5 changes: 5 additions & 0 deletions tools/deployment/package/docker-compose-base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,8 @@ services:
extends:
file: "docker-compose-all.yaml"
service: "garbage-collector"

api-server:
extends:
file: "docker-compose-all.yaml"
service: "api-server"
5 changes: 5 additions & 0 deletions tools/deployment/package/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,8 @@ services:
extends:
file: "docker-compose-all.yaml"
service: "mcp-server"

api-server:
extends:
file: "docker-compose-all.yaml"
service: "api-server"
1 change: 1 addition & 0 deletions tools/docker-images/clp-package/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,4 @@ COPY --link --chown=${UID} ./build/spider/spider-build/src/spider/spider_worker
COPY --link --chown=${UID} ./build/nodejs-22/bin/node bin/node-22
COPY --link --chown=${UID} ./build/python-libs/ lib/python3/site-packages/
COPY --link --chown=${UID} ./build/webui/ var/www/webui/
COPY --link --chown=${UID} ./build/rust/release/api_server bin/
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(i know i'm the original author of this source file.. sorry for missing that previously)

now that this has become a long list, shall we alphabetize the COPYs? i believe the effect will be the same because we have specified --link

Loading