diff --git a/.gitignore b/.gitignore
index c481ee5..d6408eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,4 +24,6 @@ file_generator.py
 .env.local
 Pipfile
 test/
-logs
\ No newline at end of file
+logs
+ai_testing/
+verify_find_files_lazy_loading.py
diff --git a/README.md b/README.md
index 175422c..07c8797 100644
--- a/README.md
+++ b/README.md
@@ -7,10 +7,10 @@ The Socket Security CLI was created to enable integrations with other tools like
 ```` shell
 socketcli [-h] [--api-token API_TOKEN] [--repo REPO] [--integration {api,github,gitlab}] [--owner OWNER] [--branch BRANCH]
           [--committers [COMMITTERS ...]] [--pr-number PR_NUMBER] [--commit-message COMMIT_MESSAGE] [--commit-sha COMMIT_SHA]
-          [--target-path TARGET_PATH] [--sbom-file SBOM_FILE] [--files FILES] [--default-branch] [--pending-head]
-          [--generate-license] [--enable-debug] [--enable-json] [--enable-sarif] [--disable-overview] [--disable-security-issue]
-          [--allow-unverified] [--ignore-commit-files] [--disable-blocking] [--scm SCM] [--timeout TIMEOUT]
-          [--exclude-license-details]
+          [--target-path TARGET_PATH] [--sbom-file SBOM_FILE] [--files FILES] [--save-submitted-files-list SAVE_SUBMITTED_FILES_LIST]
+          [--default-branch] [--pending-head] [--generate-license] [--enable-debug] [--enable-json] [--enable-sarif]
+          [--disable-overview] [--disable-security-issue] [--allow-unverified] [--ignore-commit-files] [--disable-blocking]
+          [--scm SCM] [--timeout TIMEOUT] [--exclude-license-details]
 ````

 If you don't want to provide the Socket API Token every time then you can use the environment variable `SOCKET_SECURITY_API_KEY`
@@ -40,13 +40,15 @@ If you don't want to provide the Socket API Token every time then you can use th
 | --commit-sha | False | "" | Commit SHA |

 #### Path and File
-| Parameter             | Required | Default               | Description |
-|:----------------------|:---------|:----------------------|:------------|
-| --target-path         | False    | ./                    | Target path for analysis |
-| --sbom-file           | False    |                       | SBOM file path |
-| --files               | False    | []                    | Files to analyze (JSON array string) |
-| --excluded-ecosystems | False    | []                    | List of ecosystems to exclude from analysis (JSON array string). You can get supported files from the [Supported Files API](https://docs.socket.dev/reference/getsupportedfiles) |
-| --license-file-name   | False    | `license_output.json` | Name of the file to save the license details to if enabled |
+| Parameter                   | Required | Default               | Description |
+|:----------------------------|:---------|:----------------------|:------------|
+| --target-path               | False    | ./                    | Target path for analysis |
+| --sbom-file                 | False    |                       | SBOM file path |
+| --files                     | False    | []                    | Files to analyze (JSON array string) |
+| --excluded-ecosystems       | False    | []                    | List of ecosystems to exclude from analysis (JSON array string). You can get supported files from the [Supported Files API](https://docs.socket.dev/reference/getsupportedfiles) |
+| --license-file-name         | False    | `license_output.json` | Name of the file to save the license details to if enabled |
+| --save-submitted-files-list | False    |                       | Save the list of submitted file names to a JSON file for debugging purposes |
+| --save-manifest-tar         | False    |                       | Save all manifest files to a compressed tar.gz archive with the original directory structure |

 #### Branch and Scan Configuration
 | Parameter | Required | Default | Description |
@@ -133,6 +135,73 @@ The CLI determines which files to scan based on the following logic:
 - **Using `--files`**: If you specify `--files '["package.json"]'`, the CLI will check if this file exists and is a manifest file before triggering a scan.
 - **Using `--ignore-commit-files`**: This forces a scan of all manifest files in the target path, regardless of what's in your commit.

+## Debugging and Troubleshooting
+
+### Saving Submitted Files List
+
+The CLI provides a debugging option to save the list of files that were submitted for scanning:
+
+```bash
+socketcli --save-submitted-files-list submitted_files.json
+```
+
+This will create a JSON file containing:
+- Timestamp of when the scan was performed
+- Total number of files submitted
+- Total size of all files (in bytes and human-readable format)
+- Complete list of file paths that were found and submitted for scanning
+
+Example output file:
+```json
+{
+  "timestamp": "2025-01-22 10:30:45 UTC",
+  "total_files": 3,
+  "total_size_bytes": 2048,
+  "total_size_human": "2.00 KB",
+  "files": [
+    "./package.json",
+    "./requirements.txt",
+    "./Pipfile"
+  ]
+}
+```
+
+This feature is useful for:
+- **Debugging**: Understanding which files the CLI found and submitted
+- **Verification**: Confirming that expected manifest files are being detected
+- **Size Analysis**: Understanding the total size of manifest files being uploaded
+- **Troubleshooting**: Identifying why certain files might not be included in scans, or whether size limits are being hit
+
+> **Note**: This option works with both differential scans (when git commits are detected) and full scans (API mode).
+
+### Saving Manifest Files Archive
+
+For backup, sharing, or analysis purposes, you can save all manifest files to a compressed tar.gz archive:
+
+```bash
+socketcli --save-manifest-tar manifest_files.tar.gz
+```
+
+This will create a compressed archive containing all the manifest files that were found and submitted for scanning, preserving their original directory structure relative to the scanned directory.
+
+Example usage with other options:
+```bash
+# Save both files list and archive
+socketcli --save-submitted-files-list files.json --save-manifest-tar backup.tar.gz
+
+# Use with specific target path
+socketcli --target-path ./my-project --save-manifest-tar my-project-manifests.tar.gz
+```
+
+The manifest archive feature is useful for:
+- **Backup**: Creating portable backups of all dependency manifest files
+- **Sharing**: Sending the exact files being analyzed to colleagues or support
+- **Analysis**: Examining the dependency files offline or with other tools
+- **Debugging**: Verifying file discovery and content issues
+- **Compliance**: Maintaining records of scanned dependency files
+
+> **Note**: The tar.gz archive preserves the original directory structure, making it easy to extract and examine the files in their proper context.
+
 ## Development

 This project uses `pyproject.toml` as the primary dependency specification.
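As a quick sanity check of the two artifacts documented above, both can be inspected with the standard library alone. A minimal sketch, assuming the output names used in the examples (`submitted_files.json`, `manifest_files.tar.gz`):

```python
import json
import tarfile

# Inspect the submitted-files list written by --save-submitted-files-list
with open("submitted_files.json", encoding="utf-8") as f:
    report = json.load(f)
print(f"{report['total_files']} files, {report['total_size_human']} total")

# Inspect the archive written by --save-manifest-tar
with tarfile.open("manifest_files.tar.gz", "r:gz") as tar:
    for member in tar.getmembers():
        print(member.name, member.size)
```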
diff --git a/pyproject.toml b/pyproject.toml
index 7027425..0339b2f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"

 [project]
 name = "socketsecurity"
-version = "2.1.21"
+version = "2.1.23"
 requires-python = ">= 3.10"
 license = {"file" = "LICENSE"}
 dependencies = [
diff --git a/socketsecurity/__init__.py b/socketsecurity/__init__.py
index dd5d61a..b2c1d88 100644
--- a/socketsecurity/__init__.py
+++ b/socketsecurity/__init__.py
@@ -1,2 +1,2 @@
 __author__ = 'socket.dev'
-__version__ = '2.1.21'
+__version__ = '2.1.23'
diff --git a/socketsecurity/config.py b/socketsecurity/config.py
index dae0745..817c7da 100644
--- a/socketsecurity/config.py
+++ b/socketsecurity/config.py
@@ -57,6 +57,8 @@ class CliConfig:
     jira_plugin: PluginConfig = field(default_factory=PluginConfig)
     slack_plugin: PluginConfig = field(default_factory=PluginConfig)
     license_file_name: str = "license_output.json"
+    save_submitted_files_list: Optional[str] = None
+    save_manifest_tar: Optional[str] = None

     @classmethod
     def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig':
@@ -101,6 +103,8 @@ def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig':
             'repo_is_public': args.repo_is_public,
             "excluded_ecosystems": args.excluded_ecosystems,
             'license_file_name': args.license_file_name,
+            'save_submitted_files_list': args.save_submitted_files_list,
+            'save_manifest_tar': args.save_manifest_tar,
             'version': __version__
         }
         try:
@@ -262,6 +266,18 @@ def create_argument_parser() -> argparse.ArgumentParser:
         metavar="",
         help="SBOM file path"
     )
+    path_group.add_argument(
+        "--save-submitted-files-list",
+        dest="save_submitted_files_list",
+        metavar="",
+        help="Save list of submitted file names to JSON file for debugging purposes"
+    )
+    path_group.add_argument(
+        "--save-manifest-tar",
+        dest="save_manifest_tar",
+        metavar="",
+        help="Save all manifest files to a compressed tar.gz archive with original directory structure"
+    )
     path_group.add_argument(
         "--files",
         metavar="",
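The two new options flow through `CliConfig.from_args` like the existing path options. A hedged sketch of how they surface on the config object (argument values are illustrative, and this assumes no other arguments are required for parsing):

```python
from socketsecurity.config import CliConfig

config = CliConfig.from_args([
    "--target-path", "./my-project",
    "--save-submitted-files-list", "files.json",  # illustrative path
    "--save-manifest-tar", "manifests.tar.gz",    # illustrative path
])
assert config.save_submitted_files_list == "files.json"
assert config.save_manifest_tar == "manifests.tar.gz"
```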
diff --git a/socketsecurity/core/__init__.py b/socketsecurity/core/__init__.py
index 257b014..7688299 100644
--- a/socketsecurity/core/__init__.py
+++ b/socketsecurity/core/__init__.py
@@ -1,14 +1,15 @@
 import logging
 import os
 import sys
+import tarfile
 import time
 import io
+import json
 from dataclasses import asdict
 from glob import glob
 from io import BytesIO
 from pathlib import PurePath
 from typing import BinaryIO, Dict, List, Tuple, Set, Union
-import re
 from socketdev import socketdev
 from socketdev.exceptions import APIFailure
 from socketdev.fullscans import FullScanParams, SocketArtifact
@@ -28,6 +29,8 @@ from socketsecurity.core.exceptions import APIResourceNotFound
 from .socket_config import SocketConfig
 from .utils import socket_globs
+from .resource_utils import check_file_count_against_ulimit
+from .lazy_file_loader import load_files_for_sending_lazy
 import importlib

 logging_std = importlib.import_module("logging")
@@ -176,6 +179,114 @@ def is_excluded(file_path: str, excluded_dirs: Set[str]) -> bool:
                 return True
         return False

+    def save_submitted_files_list(self, files: List[str], output_path: str) -> None:
+        """
+        Save the list of submitted file names to a JSON file for debugging.
+
+        Args:
+            files: List of file paths that were submitted for scanning
+            output_path: Path where to save the JSON file
+        """
+        try:
+            # Calculate total size of all files
+            total_size_bytes = 0
+            valid_files = []
+
+            for file_path in files:
+                try:
+                    if os.path.exists(file_path) and os.path.isfile(file_path):
+                        file_size = os.path.getsize(file_path)
+                        total_size_bytes += file_size
+                        valid_files.append(file_path)
+                    else:
+                        log.warning(f"File not found or not accessible: {file_path}")
+                        valid_files.append(file_path)  # Still include in list for debugging
+                except OSError as e:
+                    log.warning(f"Error accessing file {file_path}: {e}")
+                    valid_files.append(file_path)  # Still include in list for debugging
+
+            # Convert bytes to human-readable format
+            def format_bytes(bytes_value):
+                """Convert bytes to human readable format"""
+                for unit in ['B', 'KB', 'MB', 'GB']:
+                    if bytes_value < 1024.0:
+                        return f"{bytes_value:.2f} {unit}"
+                    bytes_value /= 1024.0
+                return f"{bytes_value:.2f} TB"
+
+            file_data = {
+                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime()),
+                "total_files": len(valid_files),
+                "total_size_bytes": total_size_bytes,
+                "total_size_human": format_bytes(total_size_bytes),
+                "files": sorted(valid_files)
+            }
+
+            with open(output_path, 'w', encoding='utf-8') as f:
+                json.dump(file_data, f, indent=2, ensure_ascii=False)
+
+            log.info(f"Saved list of {len(valid_files)} submitted files ({file_data['total_size_human']}) to: {output_path}")
+
+        except Exception as e:
+            log.error(f"Failed to save submitted files list to {output_path}: {e}")
+
+    def save_manifest_tar(self, files: List[str], output_path: str, base_dir: str) -> None:
+        """
+        Save all manifest files to a compressed tar.gz archive with original directory structure.
+
+        Args:
+            files: List of file paths to include in the archive
+            output_path: Path where to save the tar.gz file
+            base_dir: Base directory to preserve relative structure
+        """
+        try:
+            # Normalize base directory
+            base_dir = os.path.abspath(base_dir)
+            if not base_dir.endswith(os.sep):
+                base_dir += os.sep
+
+            log.info(f"Creating manifest tar.gz file: {output_path}")
+            log.debug(f"Base directory: {base_dir}")
+
+            with tarfile.open(output_path, 'w:gz') as tar:
+                for file_path in files:
+                    if not os.path.exists(file_path):
+                        log.warning(f"File not found, skipping: {file_path}")
+                        continue
+
+                    # Calculate relative path within the base directory
+                    abs_file_path = os.path.abspath(file_path)
+                    if abs_file_path.startswith(base_dir):
+                        # File is within base directory - use relative path
+                        arcname = os.path.relpath(abs_file_path, base_dir)
+                    else:
+                        # File is outside base directory - use just the filename
+                        arcname = os.path.basename(abs_file_path)
+                        log.warning(f"File outside base dir, using basename: {file_path} -> {arcname}")
+
+                    # Normalize archive name to use forward slashes
+                    arcname = arcname.replace(os.sep, '/')
+
+                    log.debug(f"Adding to tar: {file_path} -> {arcname}")
+                    tar.add(file_path, arcname=arcname)
+
+            # Get tar file size for logging
+            tar_size = os.path.getsize(output_path)
+
+            def format_bytes(bytes_value):
+                """Convert bytes to human readable format"""
+                for unit in ['B', 'KB', 'MB', 'GB']:
+                    if bytes_value < 1024.0:
+                        return f"{bytes_value:.2f} {unit}"
+                    bytes_value /= 1024.0
+                return f"{bytes_value:.2f} TB"
+
+            tar_size_human = format_bytes(tar_size)
+            log.info(f"Successfully created tar.gz with {len(files)} files ({tar_size_human}, {tar_size:,} bytes): {output_path}")
+
+        except Exception as e:
+            log.error(f"Failed to save manifest tar.gz to {output_path}: {e}")
+
     def find_files(self, path: str) -> List[str]:
         """
         Finds supported manifest files in the given path.
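Editorial note: the byte-formatting helper is defined twice, once inside each method above. A module-level version that both could call would remove the duplication; a sketch matching the inline implementations:

```python
def format_bytes(bytes_value: float) -> str:
    """Convert a byte count to a human-readable string (B, KB, MB, GB, TB)."""
    for unit in ['B', 'KB', 'MB', 'GB']:
        if bytes_value < 1024.0:
            return f"{bytes_value:.2f} {unit}"
        bytes_value /= 1024.0
    return f"{bytes_value:.2f} TB"
```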
{output_path}: {e}") + def find_files(self, path: str) -> List[str]: """ Finds supported manifest files in the given path. @@ -196,7 +307,7 @@ def find_files(self, path: str) -> List[str]: for ecosystem in patterns: if ecosystem in self.config.excluded_ecosystems: continue - log.info(f'Scanning ecosystem: {ecosystem}') + log.debug(f'Scanning ecosystem: {ecosystem}') ecosystem_patterns = patterns[ecosystem] for file_name in ecosystem_patterns: original_pattern = ecosystem_patterns[file_name]["pattern"] @@ -219,8 +330,24 @@ def find_files(self, path: str) -> List[str]: glob_end = time.time() log.debug(f"Globbing took {glob_end - glob_start:.4f} seconds") - log.info(f"Total files found: {len(files)}") - return sorted(files) + file_list = sorted(files) + file_count = len(file_list) + log.info(f"Total files found: {file_count}") + + # Check if the number of manifest files might exceed ulimit -n + ulimit_check = check_file_count_against_ulimit(file_count) + if ulimit_check["can_check"]: + if ulimit_check["would_exceed"]: + log.warning(f"Found {file_count} manifest files, which may exceed the file descriptor limit (ulimit -n = {ulimit_check['soft_limit']})") + log.warning(f"Available file descriptors: {ulimit_check['available_fds']} (after {ulimit_check['buffer_size']} buffer)") + log.warning(f"Recommendation: {ulimit_check['recommendation']}") + log.warning("This may cause 'Too many open files' errors during processing") + else: + log.debug(f"File count ({file_count}) is within file descriptor limit ({ulimit_check['soft_limit']})") + else: + log.debug(f"Could not check file descriptor limit: {ulimit_check.get('error', 'Unknown error')}") + + return file_list def get_supported_patterns(self) -> Dict: """ @@ -273,6 +400,18 @@ def has_manifest_files(self, files: list) -> bool: return True return False + def check_file_count_limit(self, file_count: int) -> dict: + """ + Check if the given file count would exceed the system's file descriptor limit. + + Args: + file_count: Number of files to check + + Returns: + Dictionary with check results including recommendations + """ + return check_file_count_against_ulimit(file_count) + @staticmethod def to_case_insensitive_regex(input_string: str) -> str: """ @@ -300,7 +439,10 @@ def empty_head_scan_file() -> list[tuple[str, tuple[str, Union[BinaryIO, BytesIO @staticmethod def load_files_for_sending(files: List[str], workspace: str) -> List[Tuple[str, Tuple[str, BinaryIO]]]: """ - Prepares files for sending to the Socket API. + Prepares files for sending to the Socket API using lazy loading. + + This version uses lazy file loading to prevent "Too many open files" errors + when processing large numbers of manifest files. Args: files: List of file paths from find_files() @@ -310,25 +452,7 @@ def load_files_for_sending(files: List[str], workspace: str) -> List[Tuple[str, List of tuples formatted for requests multipart upload: [(field_name, (filename, file_object)), ...] 
""" - send_files = [] - if "\\" in workspace: - workspace = workspace.replace("\\", "/") - for file_path in files: - _, name = file_path.rsplit("/", 1) - - if file_path.startswith(workspace): - key = file_path[len(workspace):] - else: - key = file_path - - key = key.lstrip("/") - key = key.lstrip("./") - - f = open(file_path, 'rb') - payload = (key, (name.lstrip(workspace), f)) - send_files.append(payload) - - return send_files + return load_files_for_sending_lazy(files, workspace) def create_full_scan(self, files: list[tuple[str, tuple[str, BytesIO]]], params: FullScanParams) -> FullScan: """ @@ -356,6 +480,85 @@ def create_full_scan(self, files: list[tuple[str, tuple[str, BytesIO]]], params: return full_scan + def create_full_scan_with_report_url( + self, + path: str, + params: FullScanParams, + no_change: bool = False, + save_files_list_path: str = None, + save_manifest_tar_path: str = None + ) -> dict: + """Create a new full scan and return with html_report_url. + + Args: + path: Path to look for manifest files + params: Query params for the Full Scan endpoint + no_change: If True, return empty result + save_files_list_path: Optional path to save submitted files list for debugging + save_manifest_tar_path: Optional path to save manifest files tar.gz archive + + Returns: + Dict with full scan data including html_report_url + """ + log.debug(f"starting create_full_scan_with_report_url with no_change: {no_change}") + if no_change: + return { + "id": "NO_SCAN_RAN", + "html_report_url": "", + "unmatchedFiles": [] + } + + # Find manifest files + files = self.find_files(path) + + # Save submitted files list if requested + if save_files_list_path and files: + self.save_submitted_files_list(files, save_files_list_path) + + # Save manifest tar.gz if requested + if save_manifest_tar_path and files: + self.save_manifest_tar(files, save_manifest_tar_path, path) + + files_for_sending = self.load_files_for_sending(files, path) + if not files: + return { + "id": "NO_SCAN_RAN", + "html_report_url": "", + "unmatchedFiles": [] + } + + try: + # Create new scan + new_scan_start = time.time() + new_full_scan = self.create_full_scan(files_for_sending, params) + new_scan_end = time.time() + log.info(f"Total time to create new full scan: {new_scan_end - new_scan_start:.2f}") + except APIFailure as e: + log.error(f"Failed to create full scan: {e}") + raise + + # Construct report URL + base_socket = "https://socket.dev/dashboard/org" + report_url = f"{base_socket}/{self.config.org_slug}/sbom/{new_full_scan.id}" + if not params.include_license_details: + report_url += "?include_license_details=false" + + # Return result in the format expected by the user + return { + "id": new_full_scan.id, + "created_at": new_full_scan.created_at, + "updated_at": new_full_scan.updated_at, + "organization_id": new_full_scan.organization_id, + "repository_id": new_full_scan.repository_id, + "branch": new_full_scan.branch, + "commit_message": new_full_scan.commit_message, + "commit_hash": new_full_scan.commit_hash, + "pull_request": new_full_scan.pull_request, + "committers": new_full_scan.committers, + "html_report_url": report_url, + "unmatchedFiles": getattr(new_full_scan, 'unmatchedFiles', []) + } + def check_full_scans_status(self, head_full_scan_id: str, new_full_scan_id: str) -> bool: is_ready = False current_timeout = self.config.timeout @@ -656,7 +859,9 @@ def create_new_diff( self, path: str, params: FullScanParams, - no_change: bool = False + no_change: bool = False, + save_files_list_path: str = None, + 
@@ -656,7 +859,9 @@ def create_new_diff(
         self,
         path: str,
         params: FullScanParams,
-        no_change: bool = False
+        no_change: bool = False,
+        save_files_list_path: str = None,
+        save_manifest_tar_path: str = None
     ) -> Diff:
         """Create a new diff using the Socket SDK.

@@ -664,16 +869,27 @@ def create_new_diff(
             path: Path to look for manifest files
             params: Query params for the Full Scan endpoint
             no_change: If True, return empty diff
+            save_files_list_path: Optional path to save submitted files list for debugging
+            save_manifest_tar_path: Optional path to save manifest files tar.gz archive
         """
         log.debug(f"starting create_new_diff with no_change: {no_change}")
         if no_change:
-            return Diff(id="no_diff_id", diff_url="", report_url="")
+            return Diff(id="NO_DIFF_RAN", diff_url="", report_url="")

         # Find manifest files; bail out before preparing uploads if there are none
         files = self.find_files(path)
+        if not files:
+            return Diff(id="NO_DIFF_RAN", diff_url="", report_url="")
+
+        # Save submitted files list if requested
+        if save_files_list_path and files:
+            self.save_submitted_files_list(files, save_files_list_path)
+
+        # Save manifest tar.gz if requested
+        if save_manifest_tar_path and files:
+            self.save_manifest_tar(files, save_manifest_tar_path, path)
+
         files_for_sending = self.load_files_for_sending(files, path)

         try:
             # Get head scan ID
@@ -809,12 +1025,6 @@ def create_diff_report(

         return diff

-    def get_all_scores(self, packages: dict[str, Package]) -> dict[str, Package]:
-        components = []
-        for package_id in packages:
-            package = packages[package_id]
-        return packages
-
     def create_purl(self, package_id: str, packages: dict[str, Package]) -> Purl:
         """
         Creates the extended PURL data for package identification and tracking.
+ """ + + def __init__(self, file_path: str, name: str): + self.file_path = file_path + self.name = name + self._file = None + self._closed = False + self._position = 0 + + def _ensure_open(self): + """Ensure the file is open and seek to the correct position.""" + if self._closed: + raise ValueError("I/O operation on closed file.") + + if self._file is None: + self._file = open(self.file_path, 'rb') + log.debug(f"Opened file for reading: {self.file_path}") + # Seek to the current position if we've been reading before + if self._position > 0: + self._file.seek(self._position) + + def read(self, size: int = -1): + """Read from the file, opening it if needed.""" + self._ensure_open() + data = self._file.read(size) + self._position = self._file.tell() + return data + + def readline(self, size: int = -1): + """Read a line from the file.""" + self._ensure_open() + data = self._file.readline(size) + self._position = self._file.tell() + return data + + def seek(self, offset: int, whence: int = 0): + """Seek to a position in the file.""" + if self._closed: + raise ValueError("I/O operation on closed file.") + + # Calculate new position for tracking + if whence == 0: # SEEK_SET + self._position = offset + elif whence == 1: # SEEK_CUR + self._position += offset + elif whence == 2: # SEEK_END + # We need to open the file to get its size + self._ensure_open() + result = self._file.seek(offset, whence) + self._position = self._file.tell() + return result + + # If file is already open, seek it too + if self._file is not None: + result = self._file.seek(self._position) + return result + + return self._position + + def tell(self): + """Return current file position.""" + if self._closed: + raise ValueError("I/O operation on closed file.") + + if self._file is not None: + self._position = self._file.tell() + + return self._position + + def close(self): + """Close the file if it was opened.""" + if self._file is not None: + self._file.close() + log.debug(f"Closed file: {self.file_path}") + self._file = None + self._closed = True + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + @property + def closed(self): + """Check if the file is closed.""" + return self._closed + + @property + def mode(self): + """Return the file mode.""" + return 'rb' + + def readable(self): + """Return whether the file is readable.""" + return not self._closed + + def writable(self): + """Return whether the file is writable.""" + return False + + def seekable(self): + """Return whether the file supports seeking.""" + return True + + +def load_files_for_sending_lazy(files: List[str], workspace: str) -> List[Tuple[str, Tuple[str, LazyFileLoader]]]: + """ + Prepares files for sending to the Socket API using lazy loading. + + This version doesn't open all files immediately, instead it creates + LazyFileLoader objects that only open files when they're actually read. + This prevents "Too many open files" errors when dealing with large numbers + of manifest files. + + Args: + files: List of file paths from find_files() + workspace: Base directory path to make paths relative to + + Returns: + List of tuples formatted for requests multipart upload: + [(field_name, (filename, lazy_file_object)), ...] 
+ """ + send_files = [] + if "\\" in workspace: + workspace = workspace.replace("\\", "/") + + for file_path in files: + _, name = file_path.rsplit("/", 1) + + if file_path.startswith(workspace): + key = file_path[len(workspace):] + else: + key = file_path + + key = key.lstrip("/") + key = key.lstrip("./") + + # Create lazy file loader instead of opening file immediately + # Use the relative path (key) as filename instead of truncated basename + lazy_file = LazyFileLoader(file_path, key) + payload = (key, (key, lazy_file)) + send_files.append(payload) + + log.debug(f"Prepared {len(send_files)} files for lazy loading") + return send_files diff --git a/socketsecurity/core/resource_utils.py b/socketsecurity/core/resource_utils.py new file mode 100644 index 0000000..2652bbf --- /dev/null +++ b/socketsecurity/core/resource_utils.py @@ -0,0 +1,58 @@ +""" +System resource utilities for the Socket Security CLI. +""" +import resource +import logging + +log = logging.getLogger("socketdev") + + +def get_file_descriptor_limit(): + """ + Get the current file descriptor limit (equivalent to ulimit -n) + + Returns: + tuple: (soft_limit, hard_limit) or (None, None) if error + """ + try: + soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) + return soft_limit, hard_limit + except OSError as e: + log.error(f"Error getting file descriptor limit: {e}") + return None, None + + +def check_file_count_against_ulimit(file_count, buffer_size=100): + """ + Check if the number of files would exceed the file descriptor limit + + Args: + file_count (int): Number of files to check + buffer_size (int): Safety buffer to leave for other file operations + + Returns: + dict: Information about the check + """ + soft_limit, hard_limit = get_file_descriptor_limit() + + if soft_limit is None: + return { + "can_check": False, + "error": "Could not determine file descriptor limit", + "safe_to_process": True # Assume safe if we can't check + } + + available_fds = soft_limit - buffer_size + would_exceed = file_count > available_fds + + return { + "can_check": True, + "file_count": file_count, + "soft_limit": soft_limit, + "hard_limit": hard_limit, + "available_fds": available_fds, + "would_exceed": would_exceed, + "safe_to_process": not would_exceed, + "buffer_size": buffer_size, + "recommendation": "Consider processing files in batches or increasing ulimit" if would_exceed else "Safe to process all files" + } diff --git a/socketsecurity/socketcli.py b/socketsecurity/socketcli.py index fc7570b..97902b7 100644 --- a/socketsecurity/socketcli.py +++ b/socketsecurity/socketcli.py @@ -75,19 +75,49 @@ def main_code(): log.debug("loaded client") core = Core(socket_config, sdk) log.debug("loaded core") - # Load files - files defaults to "[]" in CliConfig + # Parse files argument try: - files = json.loads(config.files) # Will always succeed with empty list by default - is_repo = True # FIXME: This is misleading - JSON parsing success doesn't indicate repo status + if isinstance(config.files, list): + # Already a list, use as-is + specified_files = config.files + elif isinstance(config.files, str): + # Handle different string formats + files_str = config.files.strip() + + # If the string is wrapped in extra quotes, strip them + if ((files_str.startswith('"') and files_str.endswith('"')) or + (files_str.startswith("'") and files_str.endswith("'"))): + # Check if the inner content looks like JSON + inner_str = files_str[1:-1] + if inner_str.startswith('[') and inner_str.endswith(']'): + files_str = inner_str + + # Try to parse 
diff --git a/socketsecurity/socketcli.py b/socketsecurity/socketcli.py
index fc7570b..97902b7 100644
--- a/socketsecurity/socketcli.py
+++ b/socketsecurity/socketcli.py
@@ -75,19 +75,49 @@ def main_code():
     log.debug("loaded client")
     core = Core(socket_config, sdk)
     log.debug("loaded core")
-    # Load files - files defaults to "[]" in CliConfig
+    # Parse files argument
     try:
-        files = json.loads(config.files)  # Will always succeed with empty list by default
-        is_repo = True  # FIXME: This is misleading - JSON parsing success doesn't indicate repo status
+        if isinstance(config.files, list):
+            # Already a list, use as-is
+            specified_files = config.files
+        elif isinstance(config.files, str):
+            # Handle different string formats
+            files_str = config.files.strip()
+
+            # If the string is wrapped in extra quotes, strip them
+            if ((files_str.startswith('"') and files_str.endswith('"')) or
+                    (files_str.startswith("'") and files_str.endswith("'"))):
+                # Check if the inner content looks like JSON
+                inner_str = files_str[1:-1]
+                if inner_str.startswith('[') and inner_str.endswith(']'):
+                    files_str = inner_str
+
+            # Try to parse as JSON
+            try:
+                specified_files = json.loads(files_str)
+            except json.JSONDecodeError:
+                # If JSON parsing fails, try replacing single quotes with double quotes
+                files_str = files_str.replace("'", '"')
+                specified_files = json.loads(files_str)
+        else:
+            # Default to empty list
+            specified_files = []
     except Exception as error:
-        # Only hits this if files was manually set to invalid JSON
-        log.error(f"Unable to parse {config.files}")
-        log.error(error)
+        log.error(f"Unable to parse files argument: {config.files}")
+        log.error(f"Error details: {error}")
+        log.debug(f"Files type: {type(config.files)}")
+        log.debug(f"Files repr: {repr(config.files)}")
         sys.exit(3)

+    # Determine if files were explicitly specified
+    files_explicitly_specified = config.files != "[]" and len(specified_files) > 0
+
     # Git setup
+    is_repo = False
+    git_repo = None
     try:
         git_repo = Git(config.target_path)
+        is_repo = True
         if not config.repo:
             config.repo = git_repo.repo_name
         if not config.commit_sha:
@@ -98,12 +128,10 @@ def main_code():
             config.committers = [git_repo.committer]
         if not config.commit_message:
             config.commit_message = git_repo.commit_message
-        if files and not config.ignore_commit_files:  # files is empty by default, so this is False unless files manually specified
-            files = git_repo.changed_files  # Only gets git's changed files if files were manually specified
-            is_repo = True  # Redundant since already True
     except InvalidGitRepositoryError:
-        is_repo = False  # Overwrites previous True - this is the REAL repo status
-        config.ignore_commit_files = True  # Silently changes config - should log this
+        is_repo = False
+        log.debug("Not a git repository, setting ignore_commit_files=True")
+        config.ignore_commit_files = True
     except NoSuchPathError:
         raise Exception(f"Unable to find path {config.target_path}")

@@ -125,26 +153,43 @@ def main_code():
     if scm is not None:
         config.default_branch = scm.config.is_default_branch

+    # Determine files to check based on the new logic
+    files_to_check = []
+    force_api_mode = False
+
+    if files_explicitly_specified:
+        # Case 2: Files are specified - use them and don't check commit details
+        files_to_check = specified_files
+        log.debug(f"Using explicitly specified files: {files_to_check}")
+    elif not config.ignore_commit_files and is_repo:
+        # Case 1: Files not specified and --ignore-commit-files not set - try to find changed files from commit
+        files_to_check = git_repo.changed_files
+        log.debug(f"Using changed files from commit: {files_to_check}")
+    else:
+        # ignore_commit_files is set or not a repo - scan everything but force API mode if no supported files
+        files_to_check = []
+        log.debug("No files to check from commit (ignore_commit_files=True or not a repo)")
-
-    # Combine manually specified files with git changes if applicable
-    files_to_check = set(json.loads(config.files))  # Start with manually specified files
-
-    # Add git changes if this is a repo and we're not ignoring commit files
-    if is_repo and not config.ignore_commit_files and not files_to_check:
-        files_to_check.update(git_repo.changed_files)
-
-    # Determine if we need to scan based on manifest files
-    should_skip_scan = True  # Default to skipping
-    if config.ignore_commit_files:
-        should_skip_scan = False  # Force scan if ignoring commit files
-    elif files_to_check:  # If we have any files to check
-        should_skip_scan = not core.has_manifest_files(list(files_to_check))
-        log.debug(f"in elif, should_skip_scan: {should_skip_scan}")
-
-    if should_skip_scan:
-        log.debug("No manifest files found in changes, skipping scan")
+    # Check if we have supported manifest files
+    has_supported_files = files_to_check and core.has_manifest_files(files_to_check)
+
+    # Case 3: If no supported files or files are empty, force API mode (no PR comments)
+    if not has_supported_files:
+        force_api_mode = True
+        log.debug("No supported manifest files found, forcing API mode")
+
+    # Determine scan behavior
+    should_skip_scan = False  # Always perform scan, but behavior changes based on supported files
+    if config.ignore_commit_files and not files_explicitly_specified:
+        # Force full scan when ignoring commit files and no explicit files
+        should_skip_scan = False
+        log.debug("Forcing full scan due to ignore_commit_files")
+    elif not has_supported_files:
+        # No supported files - still scan but in API mode
+        should_skip_scan = False
+        log.debug("No supported files but will scan in API mode")
     else:
-        log.debug("Found manifest files or forced scan, proceeding")
+        log.debug("Found supported manifest files, proceeding with normal scan")

     org_slug = core.config.org_slug
     if config.repo_is_public:
@@ -177,6 +222,8 @@ def main_code():
     # Initialize diff
     diff = Diff()
     diff.id = "NO_DIFF_RAN"
+    diff.diff_url = ""
+    diff.report_url = ""

     # Handle SCM-specific flows
     if scm is not None and scm.check_event_type() == "comment":
@@ -192,13 +239,11 @@ def main_code():
             log.debug("Removing comment alerts")
             scm.remove_comment_alerts(comments)
-    elif scm is not None and scm.check_event_type() != "comment":
+    elif scm is not None and scm.check_event_type() != "comment" and not force_api_mode:
         log.info("Push initiated flow")
-        if should_skip_scan:
-            log.info("No manifest files changes, skipping scan")
-        elif scm.check_event_type() == "diff":
+        if scm.check_event_type() == "diff":
             log.info("Starting comment logic for PR/MR event")
-            diff = core.create_new_diff(config.target_path, params, no_change=should_skip_scan)
+            diff = core.create_new_diff(
+                config.target_path,
+                params,
+                no_change=should_skip_scan,
+                save_files_list_path=config.save_submitted_files_list,
+                save_manifest_tar_path=config.save_manifest_tar
+            )

             comments = scm.get_comments_for_pr()
             log.debug("Removing comment alerts")
@@ -251,16 +296,28 @@ def main_code():
             )
         else:
             log.info("Starting non-PR/MR flow")
-            diff = core.create_new_diff(config.target_path, params, no_change=should_skip_scan)
+            diff = core.create_new_diff(
+                config.target_path,
+                params,
+                no_change=should_skip_scan,
+                save_files_list_path=config.save_submitted_files_list,
+                save_manifest_tar_path=config.save_manifest_tar
+            )
             output_handler.handle_output(diff)
     else:
-        log.info("API Mode")
-        diff = core.create_new_diff(config.target_path, params, no_change=should_skip_scan)
-        output_handler.handle_output(diff)
+        if force_api_mode:
+            log.info("No manifest files changed, creating Socket Report")
+        else:
+            log.info("API Mode")
+        full_scan_result = core.create_full_scan_with_report_url(
+            config.target_path,
+            params,
+            no_change=should_skip_scan,
+            save_files_list_path=config.save_submitted_files_list,
+            save_manifest_tar_path=config.save_manifest_tar
+        )
+        log.info(f"Full scan created with ID: {full_scan_result['id']}")
+        log.info(f"Full scan report URL: {full_scan_result['html_report_url']}")
+
+        # Create a minimal diff-like object for compatibility with downstream code
+        diff = Diff()
+        diff.id = full_scan_result['id']
+        diff.report_url = full_scan_result['html_report_url']
+        diff.diff_url = full_scan_result['html_report_url']
+        diff.packages = {}  # No package data needed for API mode
+        # No output handling needed for API mode - just creating the scan

     # Handle license generation
-    if not should_skip_scan and diff.id != "no_diff_id" and config.generate_license:
+    if not should_skip_scan and diff.id != "NO_DIFF_RAN" and diff.id != "NO_SCAN_RAN" and config.generate_license:
         all_packages = {}
         for purl in diff.packages:
             package = diff.packages[purl]
@@ -279,6 +336,11 @@ def main_code():
             all_packages[package.id] = output
         core.save_file(config.license_file_name, json.dumps(all_packages))

+    # If we forced API mode due to no supported files, behave as if --disable-blocking was set
+    if force_api_mode and not config.disable_blocking:
+        log.debug("Temporarily enabling disable_blocking due to no supported manifest files")
+        config.disable_blocking = True
+
     sys.exit(output_handler.return_exit_code(diff))
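Returning to the `--files` parsing in `socketcli.py` above: the quote-stripping exists because shells and CI systems hand the list over in several shapes. An illustrative standalone sketch of the inputs the parser now tolerates:

```python
import json

examples = [
    '["package.json"]',      # plain JSON array
    '\'["package.json"]\'',  # wrapped in an extra layer of quotes
    "['package.json']",      # single-quoted entries
]
for raw in examples:
    s = raw.strip()
    if s[0] == s[-1] and s[0] in "\"'" and s[1:-1].startswith("["):
        s = s[1:-1]  # strip the extra quoting layer
    try:
        print(json.loads(s))
    except json.JSONDecodeError:
        print(json.loads(s.replace("'", '"')))  # repair single quotes
```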