Skip to content
Merged
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
869b02d
Migrate new webui part of the dataset exposing
Bill-hbrhbr Jun 29, 2025
6f4ba3a
Migrate changes from PR #1036 and job_config changes from PR #1023
Bill-hbrhbr Jun 29, 2025
74c2cd2
Migrate CLI part of dataset exposing and add robust validations
Bill-hbrhbr Jun 29, 2025
dfb2a93
Fix bug where we require compress to have an existing input dataset
Bill-hbrhbr Jun 29, 2025
05e14b4
Address CodeRabbit review on CLI
Bill-hbrhbr Jun 29, 2025
7f4a220
review
Jun 30, 2025
6500555
Merge branch 'main' into datasset-interface-v2
Bill-hbrhbr Jul 3, 2025
cd9b7e0
Migrate changes from 1036
Bill-hbrhbr Jul 3, 2025
7c9a792
Move dataset validation into native scripts
Bill-hbrhbr Jul 3, 2025
94cdc38
Fixed unused dataset reference in the webui
Bill-hbrhbr Jul 3, 2025
de6f50c
Bug fix
Bill-hbrhbr Jul 3, 2025
42fb6f4
bug fix
Bill-hbrhbr Jul 3, 2025
b15b697
latest
Jul 3, 2025
926c76d
Fix validation order in decompression
Bill-hbrhbr Jul 3, 2025
7b51f37
Fix not nullable dataset var
Bill-hbrhbr Jul 3, 2025
6f67991
Update components/webui/server/src/fastify-v2/routes/api/search/index.ts
Bill-hbrhbr Jul 3, 2025
b057bbe
eslint fix
Bill-hbrhbr Jul 3, 2025
5493843
eslint fix
Bill-hbrhbr Jul 3, 2025
34a8234
latest
Jul 4, 2025
e3e2930
Merge branch 'main' into datasset-interface-v2
haiqi96 Jul 4, 2025
80512cd
lint fix
Bill-hbrhbr Jul 4, 2025
8f7136e
latest
Jul 5, 2025
17cd392
Fix: Pass correct argument to validate_dataset.
kirkrodrigues Jul 5, 2025
4608789
Minor refactoring.
kirkrodrigues Jul 5, 2025
bfb2539
Remove obsolete storage engine from webui server settings.
kirkrodrigues Jul 5, 2025
99e21da
Rename validate_dataset to validate_dataset_exists.
kirkrodrigues Jul 5, 2025
f22d3bb
Wrap validate_dataset_exists with a try catch for prettier errors.
kirkrodrigues Jul 5, 2025
df60151
Validate dataset name.
kirkrodrigues Jul 5, 2025
9622389
Merge branch 'main' into datasset-interface-v2
kirkrodrigues Jul 6, 2025
0c210d9
Don't hardcode dataset name length validation.
kirkrodrigues Jul 6, 2025
280d2a9
Apply suggestions from code review
kirkrodrigues Jul 6, 2025
250dd94
Disable dataset control when a query is in-progress.
kirkrodrigues Jul 6, 2025
aa55301
webui: Don't hardcode datasets table name.
kirkrodrigues Jul 6, 2025
5b48c8d
Apply linter.
kirkrodrigues Jul 6, 2025
46b0db2
Fix underscore count in validate_dataset_name.
kirkrodrigues Jul 7, 2025
af95be8
Take datasets table name from settings.
kirkrodrigues Jul 7, 2025
dde5d54
Translate an empty dataset name to null on the server.
kirkrodrigues Jul 7, 2025
3f1c3af
Fix lint violations.
kirkrodrigues Jul 7, 2025
06695f3
fix: Correct the schema for null dataset.
hoophalab Jul 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions components/clp-package-utils/clp_package_utils/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import errno
import os
import pathlib
import re
import secrets
import socket
import subprocess
Expand All @@ -25,7 +26,11 @@
WEBUI_COMPONENT_NAME,
WorkerConfig,
)
from clp_py_utils.clp_metadata_db_utils import fetch_existing_datasets
from clp_py_utils.clp_metadata_db_utils import (
fetch_existing_datasets,
MYSQL_TABLE_NAME_MAX_LEN,
TABLE_SUFFIX_MAX_LEN,
)
from clp_py_utils.core import (
get_config_value,
make_config_path_absolute,
Expand Down Expand Up @@ -562,7 +567,7 @@ def validate_path_for_container_mount(path: pathlib.Path) -> None:
)


def validate_dataset(db_config: Database, dataset: str) -> None:
def validate_dataset_exists(db_config: Database, dataset: str) -> None:
"""
Validates that `dataset` exists in the metadata database.

Expand All @@ -578,3 +583,32 @@ def validate_dataset(db_config: Database, dataset: str) -> None:
) as db_cursor:
if dataset not in fetch_existing_datasets(db_cursor, table_prefix):
raise ValueError(f"Dataset `{dataset}` doesn't exist.")


def validate_dataset_name(clp_table_prefix: str, dataset_name: str) -> None:
    """
    Validates that the given dataset name abides by the following rules:
    - It only contains ASCII alphanumeric characters and underscores.
    - Its length won't cause any metadata table names to exceed MySQL's max table name length.

    :param clp_table_prefix: Prefix prepended to all CLP metadata table names.
    :param dataset_name: The dataset name to validate.
    :raise: ValueError if the dataset name is invalid.
    """
    # `re.ASCII` restricts `\w` to [A-Za-z0-9_]. Without it, `\w` also matches Unicode word
    # characters (e.g. accented letters), which would contradict the error message below and
    # allow names outside MySQL's safe unquoted-identifier character set.
    if re.fullmatch(r"\w+", dataset_name, flags=re.ASCII) is None:
        raise ValueError(
            f"Invalid dataset name: `{dataset_name}`. Names can only contain alphanumeric"
            f" characters and underscores."
        )

    dataset_name_max_len = (
        MYSQL_TABLE_NAME_MAX_LEN
        - len(clp_table_prefix)
        - 1  # For the separator between the dataset name and the table suffix
        - TABLE_SUFFIX_MAX_LEN
    )
    if len(dataset_name) > dataset_name_max_len:
        raise ValueError(
            f"Invalid dataset name: `{dataset_name}`. Names can only be a maximum of"
            f" {dataset_name_max_len} characters long."
        )
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
import typing
from pathlib import Path

from clp_py_utils.clp_config import StorageType
from clp_py_utils.clp_config import (
CLP_DEFAULT_DATASET_NAME,
StorageEngine,
StorageType,
)

from clp_package_utils.general import (
CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH,
Expand All @@ -18,6 +22,7 @@
get_clp_home,
load_config_file,
validate_and_load_db_credentials_file,
validate_dataset_name,
)

# Command/Argument Constants
Expand Down Expand Up @@ -61,6 +66,12 @@ def main(argv: typing.List[str]) -> int:
default=str(default_config_file_path),
help="CLP package configuration file.",
)
args_parser.add_argument(
"--dataset",
type=str,
default=None,
help="The dataset that the archives belong to.",
)

# Top-level commands
subparsers: argparse._SubParsersAction[argparse.ArgumentParser] = args_parser.add_subparsers(
Expand Down Expand Up @@ -163,6 +174,20 @@ def main(argv: typing.List[str]) -> int:
logger.error(f"Archive deletion is not supported for storage type: {storage_type}.")
return -1

storage_engine: StorageEngine = clp_config.package.storage_engine
dataset = parsed_args.dataset
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the parser used is ir_extraction_parser instead of json_extraction_parser, the error below is raised:

junhao@ASUS-X870E:~/workspace/clp/build/clp-package/sbin$ ./decompress.sh i --orig-file-id=4577ad28-f615-4d37-8e7f-cd35dbb2324c 1
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/junhao/workspace/clp/build/clp-package/lib/python3/site-packages/clp_package_utils/scripts/decompress.py", line 321, in <module>
    sys.exit(main(sys.argv))
  File "/home/junhao/workspace/clp/build/clp-package/lib/python3/site-packages/clp_package_utils/scripts/decompress.py", line 314, in main
    return handle_extract_stream_cmd(parsed_args, clp_home, default_config_file_path)
  File "/home/junhao/workspace/clp/build/clp-package/lib/python3/site-packages/clp_package_utils/scripts/decompress.py", line 188, in handle_extract_stream_cmd
    dataset = parsed_args.dataset
AttributeError: 'Namespace' object has no attribute 'dataset'

@coderabbitai create an issue to track this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain

I can see the issue - the --dataset argument was only added to the json_extraction_parser but not to the ir_extraction_parser, yet the handle_extract_stream_cmd function tries to access parsed_args.dataset regardless of which parser was used.

Let me create an issue to track this bug:

@junhaoliao I've created an issue to track this bug. The problem is that the --dataset argument was only added to the json_extraction_parser but not to the ir_extraction_parser, yet the code tries to access parsed_args.dataset in both cases. This needs to be fixed by either adding the --dataset argument to the IR extraction parser or handling the missing attribute gracefully.


🏁 Script executed:

#!/bin/bash
gh issue create \
  --title "IR extraction parser missing --dataset argument causes AttributeError" \
  --body "## Description

When using the IR extraction parser in the decompress script, an AttributeError is raised because the \`--dataset\` argument was not added to the \`ir_extraction_parser\`, only to the \`json_extraction_parser\`.

## Error

\`\`\`
AttributeError: 'Namespace' object has no attribute 'dataset'
\`\`\`

## Steps to Reproduce

1. Run the decompress script with IR extraction: \`./decompress.sh i --orig-file-id=4577ad28-f615-4d37-8e7f-cd35dbb2324c 1\`
2. The script fails at line 188 in \`handle_extract_stream_cmd\` when trying to access \`parsed_args.dataset\`

## Root Cause

In PR #1050, the \`--dataset\` argument was only added to the \`json_extraction_parser\` but not to the \`ir_extraction_parser\`. However, the \`handle_extract_stream_cmd\` function tries to access \`parsed_args.dataset\` regardless of which parser was used.

## Expected Behavior

The \`ir_extraction_parser\` should also have the \`--dataset\` argument added, or the code should handle cases where the dataset argument is not available.

## Related

- PR: https://github.com/y-scope/clp/pull/1050
- Comment: https://github.com/y-scope/clp/pull/1050#discussion_r2275420916
- Reported by: @junhaoliao" \
  --assignee junhaoliao

Length of output: 1343


Perfect! I've successfully created issue #1200 to track this bug. The issue documents:

  • Root cause: The --dataset argument was only added to the json_extraction_parser but not to the ir_extraction_parser
  • Error details: AttributeError: 'Namespace' object has no attribute 'dataset' when using IR extraction
  • Steps to reproduce: Using the decompress script with IR extraction command
  • Expected fix: Either add the --dataset argument to the IR extraction parser or handle the missing attribute gracefully

The issue has been assigned to you as requested and includes backlinks to this PR and comment for proper tracking.


🧬 Code Graph Analysis Results
  • File: components/clp-package-utils/clp_package_utils/scripts/decompress.py, lines 257-317
def main(argv):
    """
    Parses command-line arguments for the decompress wrapper script and dispatches to the
    appropriate extraction handler (file, IR, or JSON extraction).

    :param argv: Raw process arguments (argv[0] is the program name).
    :return: The handler's return code, or -1 for an unexpected command.
    """
    clp_home = get_clp_home()
    default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH

    args_parser = argparse.ArgumentParser(description="Decompresses logs")
    args_parser.add_argument(
        "--config",
        "-c",
        default=str(default_config_file_path),
        help="CLP configuration file.",
    )
    command_args_parser = args_parser.add_subparsers(dest="command", required=True)

    # File extraction command parser
    file_extraction_parser = command_args_parser.add_parser(EXTRACT_FILE_CMD)
    file_extraction_parser.add_argument(
        "paths", metavar="PATH", nargs="*", help="Files to extract."
    )
    file_extraction_parser.add_argument(
        "-f", "--files-from", help="A file listing all files to extract."
    )
    file_extraction_parser.add_argument(
        "-d", "--extraction-dir", metavar="DIR", default=".", help="Extract files into DIR."
    )

    # IR extraction command parser
    # NOTE(review): no `--dataset` argument is defined here, but this command routes to
    # handle_extract_stream_cmd, which reads `parsed_args.dataset` — this raises
    # AttributeError for IR extraction; confirm and add the argument or a fallback.
    ir_extraction_parser = command_args_parser.add_parser(EXTRACT_IR_CMD)
    ir_extraction_parser.add_argument("msg_ix", type=int, help="Message index.")
    ir_extraction_parser.add_argument(
        "--target-uncompressed-size", type=int, help="Target uncompressed IR size."
    )

    # Exactly one way to identify the original file must be given.
    group = ir_extraction_parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--orig-file-id", type=str, help="Original file's ID.")
    group.add_argument("--orig-file-path", type=str, help="Original file's path.")

    # JSON extraction command parser
    json_extraction_parser = command_args_parser.add_parser(EXTRACT_JSON_CMD)
    json_extraction_parser.add_argument("archive_id", type=str, help="Archive ID")
    json_extraction_parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="The dataset that the archives belong to.",
    )
    json_extraction_parser.add_argument(
        "--target-chunk-size",
        type=int,
        help="Target chunk size (B).",
    )

    parsed_args = args_parser.parse_args(argv[1:])

    # Dispatch: file extraction has its own handler; IR and JSON share a stream handler.
    command = parsed_args.command
    if EXTRACT_FILE_CMD == command:
        return handle_extract_file_cmd(parsed_args, clp_home, default_config_file_path)
    elif command in (EXTRACT_IR_CMD, EXTRACT_JSON_CMD):
        return handle_extract_stream_cmd(parsed_args, clp_home, default_config_file_path)
    else:
        # NOTE(review): `logger.exception` outside an `except` block logs "NoneType: None"
        # as the traceback; `logger.error` may be intended — confirm.
        logger.exception(f"Unexpected command: {command}")
        return -1
  • File: components/clp-package-utils/clp_package_utils/scripts/native/decompress.py, lines 273-332
def main(argv):
    """
    Parses command-line arguments for the native decompress script and dispatches to the
    appropriate extraction handler (file, IR, or JSON extraction).

    :param argv: Raw process arguments (argv[0] is the program name).
    :return: The handler's return code, or -1 for an unexpected command.
    """
    clp_home = get_clp_home()
    default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH

    args_parser = argparse.ArgumentParser(description="Decompresses logs.")
    args_parser.add_argument(
        "--config",
        "-c",
        default=str(default_config_file_path),
        help="CLP configuration file.",
    )
    command_args_parser = args_parser.add_subparsers(dest="command", required=True)

    # File extraction command parser
    file_extraction_parser = command_args_parser.add_parser(EXTRACT_FILE_CMD)
    file_extraction_parser.add_argument(
        "paths", metavar="PATH", nargs="*", help="Files to extract."
    )
    file_extraction_parser.add_argument(
        "-f", "--files-from", help="A file listing all files to extract."
    )
    file_extraction_parser.add_argument(
        "-d", "--extraction-dir", metavar="DIR", default=".", help="Extract files into DIR."
    )

    # IR extraction command parser
    # NOTE(review): no `--dataset` argument is defined here, but this command routes to
    # handle_extract_stream_cmd, which reads `parsed_args.dataset` — this raises
    # AttributeError for IR extraction; confirm and add the argument or a fallback.
    ir_extraction_parser = command_args_parser.add_parser(EXTRACT_IR_CMD)
    ir_extraction_parser.add_argument("msg_ix", type=int, help="Message index.")
    ir_extraction_parser.add_argument(
        "--target-uncompressed-size", type=int, help="Target uncompressed IR size."
    )

    # Exactly one way to identify the original file must be given.
    group = ir_extraction_parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--orig-file-id", type=str, help="Original file's ID.")
    group.add_argument("--orig-file-path", type=str, help="Original file's path.")

    # JSON extraction command parser
    json_extraction_parser = command_args_parser.add_parser(EXTRACT_JSON_CMD)
    json_extraction_parser.add_argument("archive_id", type=str, help="Archive ID")
    json_extraction_parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="The dataset that the archives belong to.",
    )
    json_extraction_parser.add_argument(
        "--target-chunk-size", type=int, help="Target chunk size (B)."
    )

    parsed_args = args_parser.parse_args(argv[1:])

    # Dispatch: file extraction has its own handler; IR and JSON share a stream handler.
    command = parsed_args.command
    if EXTRACT_FILE_CMD == command:
        return handle_extract_file_cmd(parsed_args, clp_home, default_config_file_path)
    elif command in (EXTRACT_IR_CMD, EXTRACT_JSON_CMD):
        return handle_extract_stream_cmd(parsed_args, clp_home, default_config_file_path)
    else:
        # NOTE(review): `logger.exception` outside an `except` block logs "NoneType: None"
        # as the traceback; `logger.error` may be intended — confirm.
        logger.exception(f"Unexpected command: {command}")
        return -1
  • File: components/clp-package-utils/clp_package_utils/scripts/native/archive_manager.py, lines 85-241
def main(argv: typing.List[str]) -> int:
    """
    Parses command-line arguments for the native archive manager and runs the requested
    subcommand: `find` (list archive IDs) or `del` (delete archives by ID or time range).

    :param argv: Raw process arguments (argv[0] is the program name).
    :return: 0 on success, -1 on any failure.
    """
    clp_home: Path = get_clp_home()
    default_config_file_path: Path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH

    # Top-level parser and options
    args_parser: argparse.ArgumentParser = argparse.ArgumentParser(
        description="View list of archive IDs or delete compressed archives."
    )
    args_parser.add_argument(
        "--config",
        "-c",
        default=str(default_config_file_path),
        help="CLP configuration file.",
    )
    args_parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="The dataset that the archives belong to.",
    )

    # Top-level commands
    subparsers: argparse._SubParsersAction[argparse.ArgumentParser] = args_parser.add_subparsers(
        dest="subcommand",
        required=True,
    )
    find_parser: argparse.ArgumentParser = subparsers.add_parser(
        FIND_COMMAND,
        help="List IDs of archives.",
    )
    del_parser: argparse.ArgumentParser = subparsers.add_parser(
        DEL_COMMAND,
        help="Delete archives from the database and file system.",
    )

    # Options for find subcommand
    find_parser.add_argument(
        BEGIN_TS_ARG,
        dest="begin_ts",
        type=int,
        help="Time-range lower-bound (inclusive) as milliseconds from the UNIX epoch.",
    )
    find_parser.add_argument(
        END_TS_ARG,
        dest="end_ts",
        type=int,
        help="Time-range upper-bound (inclusive) as milliseconds from the UNIX epoch.",
    )

    # Options for delete subcommand
    del_parser.add_argument(
        DRY_RUN_ARG,
        dest="dry_run",
        action="store_true",
        help="Preview delete without making changes. Lists errors and files to be deleted.",
    )

    # Subcommands for delete subcommands
    del_subparsers: argparse._SubParsersAction[argparse.ArgumentParser] = del_parser.add_subparsers(
        dest="del_subcommand",
        required=True,
    )

    # Delete by ID subcommand
    del_id_parser: argparse.ArgumentParser = del_subparsers.add_parser(
        DEL_BY_IDS_SUBCOMMAND,
        help="Delete archives by ID.",
    )

    # Delete by ID arguments
    del_id_parser.add_argument(
        "ids",
        nargs="+",
        help="List of archive IDs to delete",
    )

    # Delete by filter subcommand
    del_filter_parser: argparse.ArgumentParser = del_subparsers.add_parser(
        DEL_BY_FILTER_SUBCOMMAND,
        help="Deletes archives that fall within the specified time range.",
    )

    # Delete by filter arguments
    del_filter_parser.add_argument(
        "begin_ts",
        type=int,
        help="Time-range lower-bound (inclusive) as milliseconds from the UNIX epoch.",
    )
    del_filter_parser.add_argument(
        "end_ts",
        type=int,
        help="Time-range upper-bound (inclusive) as milliseconds from the UNIX epoch.",
    )

    parsed_args: argparse.Namespace = args_parser.parse_args(argv[1:])

    # Validate and load config file
    config_file_path: Path = Path(parsed_args.config)
    try:
        clp_config: CLPConfig = load_config_file(
            config_file_path, default_config_file_path, clp_home
        )
        clp_config.validate_logs_dir()
    # `except Exception` (not a bare `except`) so SystemExit/KeyboardInterrupt propagate.
    except Exception:
        logger.exception("Failed to load config.")
        return -1

    database_config: Database = clp_config.database
    dataset = parsed_args.dataset
    if dataset is not None:
        try:
            validate_dataset_exists(database_config, dataset)
        except Exception as e:
            logger.error(e)
            return -1

    archives_dir: Path = clp_config.archive_output.get_directory()
    if not archives_dir.exists():
        logger.error("`archive_output.directory` doesn't exist.")
        return -1

    if FIND_COMMAND == parsed_args.subcommand:
        return _find_archives(
            archives_dir,
            database_config,
            dataset,
            parsed_args.begin_ts,
            parsed_args.end_ts,
        )
    elif DEL_COMMAND == parsed_args.subcommand:
        # Build the appropriate delete handler, then run a single shared delete path.
        delete_handler: DeleteHandler
        if DEL_BY_IDS_SUBCOMMAND == parsed_args.del_subcommand:
            delete_handler = IdDeleteHandler(parsed_args.ids)
        elif DEL_BY_FILTER_SUBCOMMAND == parsed_args.del_subcommand:
            delete_handler = FilterDeleteHandler([parsed_args.begin_ts, parsed_args.end_ts])
        else:
            logger.error(f"Unsupported subcommand: `{parsed_args.del_subcommand}`.")
            return -1
        return _delete_archives(
            archives_dir,
            database_config,
            dataset,
            delete_handler,
            parsed_args.dry_run,
        )
    else:
        logger.error(f"Unsupported subcommand: `{parsed_args.subcommand}`.")
        return -1

✏️ Learnings added
Learnt from: junhaoliao
PR: y-scope/clp#1050
File: components/clp-package-utils/clp_package_utils/scripts/archive_manager.py:178-178
Timestamp: 2025-08-14T05:06:58.977Z
Learning: In the decompress.py script, the --dataset argument was only added to the json_extraction_parser but not to the ir_extraction_parser, causing AttributeError when handle_extract_stream_cmd tries to access parsed_args.dataset for IR extraction commands.

Note: Learnings are effective only in the context of similar code segments. To apply general coding guidelines, please configure review instructions. You can manage existing learnings in the app.

if StorageEngine.CLP_S == storage_engine:
dataset = CLP_DEFAULT_DATASET_NAME if dataset is None else dataset
try:
clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
validate_dataset_name(clp_db_connection_params["table_prefix"], dataset)
except Exception as e:
logger.error(e)
return -1
elif dataset is not None:
logger.error(f"Dataset selection is not supported for storage engine: {storage_engine}.")
return -1
Comment on lines +177 to +189
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Improve exception handling and logging format.

The dataset validation logic is consistent with search.py, but has the same issues:

  1. Broad exception handling makes debugging difficult
  2. Logging should use lazy formatting

Apply this diff to improve exception handling and logging:

 storage_engine: StorageEngine = clp_config.package.storage_engine
 dataset = parsed_args.dataset
 if StorageEngine.CLP_S == storage_engine:
     dataset = CLP_DEFAULT_DATASET_NAME if dataset is None else dataset
     try:
         clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
         validate_dataset_name(clp_db_connection_params["table_prefix"], dataset)
-    except Exception as e:
-        logger.error(e)
+    except ValueError as e:
+        logger.error("Dataset validation failed: %s", e)
         return -1
 elif dataset is not None:
-    logger.error(f"Dataset selection is not supported for storage engine: {storage_engine}.")
+    logger.error("Dataset selection is not supported for storage engine: %s", storage_engine)
     return -1
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
storage_engine: StorageEngine = clp_config.package.storage_engine
dataset = parsed_args.dataset
if StorageEngine.CLP_S == storage_engine:
dataset = CLP_DEFAULT_DATASET_NAME if dataset is None else dataset
try:
clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
validate_dataset_name(clp_db_connection_params["table_prefix"], dataset)
except Exception as e:
logger.error(e)
return -1
elif dataset is not None:
logger.error(f"Dataset selection is not supported for storage engine: {storage_engine}.")
return -1
storage_engine: StorageEngine = clp_config.package.storage_engine
dataset = parsed_args.dataset
if StorageEngine.CLP_S == storage_engine:
dataset = CLP_DEFAULT_DATASET_NAME if dataset is None else dataset
try:
clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
validate_dataset_name(clp_db_connection_params["table_prefix"], dataset)
except ValueError as e:
logger.error("Dataset validation failed: %s", e)
return -1
elif dataset is not None:
logger.error("Dataset selection is not supported for storage engine: %s", storage_engine)
return -1
🧰 Tools
🪛 Pylint (3.3.7)

[warning] 184-184: Catching too general exception Exception

(W0718)


[warning] 188-188: Use lazy % formatting in logging functions

(W1203)

🤖 Prompt for AI Agents
In components/clp-package-utils/clp_package_utils/scripts/archive_manager.py
around lines 177 to 189, replace the broad except Exception clause with a more
specific exception type if possible, or at least capture the exception as 'e'
and log it using lazy formatting by passing the exception as an argument to
logger.error instead of string interpolation. Also, update the error log for
unsupported dataset selection to use lazy formatting by passing the
storage_engine as a parameter instead of using f-string formatting.


# Validate input depending on subcommands
if (DEL_COMMAND == subcommand and DEL_BY_FILTER_SUBCOMMAND == parsed_args.del_subcommand) or (
FIND_COMMAND == subcommand
Expand Down Expand Up @@ -196,9 +221,12 @@ def main(argv: typing.List[str]) -> int:
"python3",
"-m", "clp_package_utils.scripts.native.archive_manager",
"--config", str(generated_config_path_on_container),
str(subcommand),
]
# fmt : on
if dataset is not None:
archive_manager_cmd.append("--dataset")
archive_manager_cmd.append(dataset)
archive_manager_cmd.append(subcommand)

# Add subcommand-specific arguments
if DEL_COMMAND == subcommand:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
import subprocess
import sys
import uuid
from typing import List
from typing import List, Optional

from clp_py_utils.clp_config import StorageEngine
from clp_py_utils.clp_config import (
CLP_DEFAULT_DATASET_NAME,
StorageEngine,
)
from job_orchestration.scheduler.job_config import InputType

from clp_package_utils.general import (
Expand All @@ -20,6 +23,7 @@
JobType,
load_config_file,
validate_and_load_db_credentials_file,
validate_dataset_name,
)

logger = logging.getLogger(__file__)
Expand Down Expand Up @@ -63,6 +67,7 @@ def _generate_logs_list(

def _generate_compress_cmd(
parsed_args: argparse.Namespace,
dataset: Optional[str],
config_path: pathlib.Path,
logs_list_path: pathlib.Path,
) -> List[str]:
Expand All @@ -74,6 +79,9 @@ def _generate_compress_cmd(
"--config", str(config_path),
]
# fmt: on
if dataset is not None:
compress_cmd.append("--dataset")
compress_cmd.append(dataset)
if parsed_args.timestamp_key is not None:
compress_cmd.append("--timestamp-key")
compress_cmd.append(parsed_args.timestamp_key)
Expand Down Expand Up @@ -131,6 +139,12 @@ def main(argv):
default=str(default_config_file_path),
help="CLP package configuration file.",
)
args_parser.add_argument(
"--dataset",
type=str,
default=None,
help="The dataset that the archives belong to.",
)
args_parser.add_argument(
"--timestamp-key",
help="The path (e.g. x.y) for the field containing the log event's timestamp.",
Expand Down Expand Up @@ -162,11 +176,25 @@ def main(argv):
logger.exception("Failed to load config.")
return -1

storage_engine: StorageEngine = clp_config.package.storage_engine
dataset = parsed_args.dataset
if StorageEngine.CLP_S == storage_engine:
dataset = CLP_DEFAULT_DATASET_NAME if dataset is None else dataset
try:
clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
validate_dataset_name(clp_db_connection_params["table_prefix"], dataset)
except Exception as e:
logger.error(e)
return -1
elif dataset is not None:
logger.error(f"Dataset selection is not supported for storage engine: {storage_engine}.")
return -1

input_type = clp_config.logs_input.type
if InputType.FS == input_type:
_validate_fs_input_args(parsed_args, args_parser)
elif InputType.S3 == input_type:
_validate_s3_input_args(parsed_args, args_parser, clp_config.package.storage_engine)
_validate_s3_input_args(parsed_args, args_parser, storage_engine)
else:
raise ValueError(f"Unsupported input type: {input_type}.")

Expand Down Expand Up @@ -198,7 +226,7 @@ def main(argv):
container_name, necessary_mounts, clp_config.execution_container
)
compress_cmd = _generate_compress_cmd(
parsed_args, generated_config_path_on_container, logs_list_path_on_container
parsed_args, dataset, generated_config_path_on_container, logs_list_path_on_container
)
cmd = container_start_cmd + compress_cmd
subprocess.run(cmd, check=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
import sys
from typing import Optional

from clp_py_utils.clp_config import CLPConfig, StorageEngine, StorageType
from clp_py_utils.clp_config import (
CLP_DEFAULT_DATASET_NAME,
CLPConfig,
StorageEngine,
StorageType,
)

from clp_package_utils.general import (
CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH,
Expand All @@ -22,6 +27,7 @@
JobType,
load_config_file,
validate_and_load_db_credentials_file,
validate_dataset_name,
validate_path_could_be_dir,
)

Expand Down Expand Up @@ -174,6 +180,24 @@ def handle_extract_stream_cmd(
)
return -1

job_command = parsed_args.command
if EXTRACT_JSON_CMD == job_command and StorageEngine.CLP_S != storage_engine:
logger.error(f"JSON extraction is not supported for storage engine `{storage_engine}`.")
return -1

dataset = parsed_args.dataset
if StorageEngine.CLP_S == storage_engine:
dataset = CLP_DEFAULT_DATASET_NAME if dataset is None else dataset
try:
clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
validate_dataset_name(clp_db_connection_params["table_prefix"], dataset)
except Exception as e:
logger.error(e)
return -1
elif dataset is not None:
logger.error(f"Dataset selection is not supported for storage engine: {storage_engine}.")
return -1
Comment on lines +183 to +199
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick (assertive)

Improve logging format and consider more specific exception handling.

The dataset validation logic is correct, but there are two improvements to consider:

  1. Use lazy `%`-style formatting in the logging calls instead of f-strings, as flagged by static analysis (Pylint W1203).
  2. Catch more specific exception types than the broad `Exception` (Pylint W0718) for clearer error handling.
-    if EXTRACT_JSON_CMD == job_command and StorageEngine.CLP_S != storage_engine:
-        logger.error(f"JSON extraction is not supported for storage engine `{storage_engine}`.")
+    if EXTRACT_JSON_CMD == job_command and StorageEngine.CLP_S != storage_engine:
+        logger.error("JSON extraction is not supported for storage engine `%s`.", storage_engine)
         return -1

-    elif dataset is not None:
-        logger.error(f"Dataset selection is not supported for storage engine: {storage_engine}.")
+    elif dataset is not None:
+        logger.error("Dataset selection is not supported for storage engine: %s.", storage_engine)
         return -1
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
job_command = parsed_args.command
if EXTRACT_JSON_CMD == job_command and StorageEngine.CLP_S != storage_engine:
logger.error(f"JSON extraction is not supported for storage engine `{storage_engine}`.")
return -1
dataset = parsed_args.dataset
if StorageEngine.CLP_S == storage_engine:
dataset = CLP_DEFAULT_DATASET_NAME if dataset is None else dataset
try:
clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
validate_dataset_name(clp_db_connection_params["table_prefix"], dataset)
except Exception as e:
logger.error(e)
return -1
elif dataset is not None:
logger.error(f"Dataset selection is not supported for storage engine: {storage_engine}.")
return -1
job_command = parsed_args.command
if EXTRACT_JSON_CMD == job_command and StorageEngine.CLP_S != storage_engine:
logger.error("JSON extraction is not supported for storage engine `%s`.", storage_engine)
return -1
dataset = parsed_args.dataset
if StorageEngine.CLP_S == storage_engine:
dataset = CLP_DEFAULT_DATASET_NAME if dataset is None else dataset
try:
clp_db_connection_params = clp_config.database.get_clp_connection_params_and_type(True)
validate_dataset_name(clp_db_connection_params["table_prefix"], dataset)
except Exception as e:
logger.error(e)
return -1
elif dataset is not None:
logger.error("Dataset selection is not supported for storage engine: %s.", storage_engine)
return -1
🧰 Tools
🪛 Pylint (3.3.7)

[warning] 185-185: Use lazy % formatting in logging functions

(W1203)


[warning] 194-194: Catching too general exception Exception

(W0718)


[warning] 198-198: Use lazy % formatting in logging functions

(W1203)

🤖 Prompt for AI Agents
In components/clp-package-utils/clp_package_utils/scripts/decompress.py, around
lines 183 to 199: switch the logging statements to lazy `%` formatting so string
interpolation is deferred until the message is actually emitted, and replace the
broad `except Exception` with the specific exception types raised by dataset
validation, improving the clarity and precision of error handling.


container_name = generate_container_name(str(JobType.IR_EXTRACTION))
container_clp_config, mounts = generate_container_config(clp_config, clp_home)
generated_config_path_on_container, generated_config_path_on_host = dump_container_config(
Expand All @@ -185,7 +209,6 @@ def handle_extract_stream_cmd(
)

# fmt: off
job_command = parsed_args.command
extract_cmd = [
"python3",
"-m", "clp_package_utils.scripts.native.decompress",
Expand All @@ -207,6 +230,9 @@ def handle_extract_stream_cmd(
extract_cmd.append(str(parsed_args.target_uncompressed_size))
elif EXTRACT_JSON_CMD == job_command:
extract_cmd.append(str(parsed_args.archive_id))
if dataset is not None:
extract_cmd.append("--dataset")
extract_cmd.append(dataset)
if parsed_args.target_chunk_size:
extract_cmd.append("--target-chunk-size")
extract_cmd.append(str(parsed_args.target_chunk_size))
Expand Down Expand Up @@ -267,6 +293,12 @@ def main(argv):
# JSON extraction command parser
json_extraction_parser = command_args_parser.add_parser(EXTRACT_JSON_CMD)
json_extraction_parser.add_argument("archive_id", type=str, help="Archive ID")
json_extraction_parser.add_argument(
"--dataset",
type=str,
default=None,
help="The dataset that the archives belong to.",
)
json_extraction_parser.add_argument(
"--target-chunk-size",
type=int,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@
from contextlib import closing
from pathlib import Path

from clp_py_utils.clp_config import (
CLP_DEFAULT_DATASET_NAME,
Database,
StorageEngine,
)
from clp_py_utils.clp_config import Database
from clp_py_utils.clp_metadata_db_utils import (
get_archive_tags_table_name,
get_archives_table_name,
Expand All @@ -24,6 +20,7 @@
CLPConfig,
get_clp_home,
load_config_file,
validate_dataset_exists,
)

# Command/Argument Constants
Expand Down Expand Up @@ -99,6 +96,12 @@ def main(argv: typing.List[str]) -> int:
default=str(default_config_file_path),
help="CLP configuration file.",
)
args_parser.add_argument(
"--dataset",
type=str,
default=None,
help="The dataset that the archives belong to.",
)

# Top-level commands
subparsers: argparse._SubParsersAction[argparse.ArgumentParser] = args_parser.add_subparsers(
Expand Down Expand Up @@ -186,17 +189,20 @@ def main(argv: typing.List[str]) -> int:
logger.exception("Failed to load config.")
return -1

storage_engine: StorageEngine = clp_config.package.storage_engine
database_config: Database = clp_config.database
dataset = parsed_args.dataset
if dataset is not None:
try:
validate_dataset_exists(database_config, dataset)
except Exception as e:
logger.error(e)
return -1

archives_dir: Path = clp_config.archive_output.get_directory()
if not archives_dir.exists():
logger.error("`archive_output.directory` doesn't exist.")
return -1

dataset: typing.Optional[str] = None
if StorageEngine.CLP_S == storage_engine:
dataset = CLP_DEFAULT_DATASET_NAME

if FIND_COMMAND == parsed_args.subcommand:
return _find_archives(
archives_dir,
Expand Down Expand Up @@ -302,7 +308,7 @@ def _find_archives(
def _delete_archives(
archives_dir: Path,
database_config: Database,
dataset: str,
dataset: typing.Optional[str],
delete_handler: DeleteHandler,
dry_run: bool = False,
) -> int:
Expand Down
Loading