Skip to content

Refactor file detection and scanning logic to fix commit file handling #101

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 23, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "hatchling.build"

[project]
name = "socketsecurity"
version = "2.1.21"
version = "2.1.22"
requires-python = ">= 3.10"
license = {"file" = "LICENSE"}
dependencies = [
Expand Down
2 changes: 1 addition & 1 deletion socketsecurity/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__author__ = 'socket.dev'
__version__ = '2.1.21'
__version__ = '2.1.22'
79 changes: 69 additions & 10 deletions socketsecurity/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from io import BytesIO
from pathlib import PurePath
from typing import BinaryIO, Dict, List, Tuple, Set, Union
import re
from socketdev import socketdev
from socketdev.exceptions import APIFailure
from socketdev.fullscans import FullScanParams, SocketArtifact
Expand Down Expand Up @@ -196,7 +195,7 @@ def find_files(self, path: str) -> List[str]:
for ecosystem in patterns:
if ecosystem in self.config.excluded_ecosystems:
continue
log.info(f'Scanning ecosystem: {ecosystem}')
log.debug(f'Scanning ecosystem: {ecosystem}')
ecosystem_patterns = patterns[ecosystem]
for file_name in ecosystem_patterns:
original_pattern = ecosystem_patterns[file_name]["pattern"]
Expand Down Expand Up @@ -356,6 +355,72 @@ def create_full_scan(self, files: list[tuple[str, tuple[str, BytesIO]]], params:

return full_scan

def create_full_scan_with_report_url(
self,
path: str,
params: FullScanParams,
no_change: bool = False
) -> dict:
"""Create a new full scan and return with html_report_url.

Args:
path: Path to look for manifest files
params: Query params for the Full Scan endpoint
no_change: If True, return empty result

Returns:
Dict with full scan data including html_report_url
"""
log.debug(f"starting create_full_scan_with_report_url with no_change: {no_change}")
if no_change:
return {
"id": "NO_SCAN_RAN",
"html_report_url": "",
"unmatchedFiles": []
}

# Find manifest files
files = self.find_files(path)
files_for_sending = self.load_files_for_sending(files, path)
if not files:
return {
"id": "NO_SCAN_RAN",
"html_report_url": "",
"unmatchedFiles": []
}

try:
# Create new scan
new_scan_start = time.time()
new_full_scan = self.create_full_scan(files_for_sending, params)
new_scan_end = time.time()
log.info(f"Total time to create new full scan: {new_scan_end - new_scan_start:.2f}")
except APIFailure as e:
log.error(f"Failed to create full scan: {e}")
raise

# Construct report URL
base_socket = "https://socket.dev/dashboard/org"
report_url = f"{base_socket}/{self.config.org_slug}/sbom/{new_full_scan.id}"
if not params.include_license_details:
report_url += "?include_license_details=false"

# Return result in the format expected by the user
return {
"id": new_full_scan.id,
"created_at": new_full_scan.created_at,
"updated_at": new_full_scan.updated_at,
"organization_id": new_full_scan.organization_id,
"repository_id": new_full_scan.repository_id,
"branch": new_full_scan.branch,
"commit_message": new_full_scan.commit_message,
"commit_hash": new_full_scan.commit_hash,
"pull_request": new_full_scan.pull_request,
"committers": new_full_scan.committers,
"html_report_url": report_url,
"unmatchedFiles": getattr(new_full_scan, 'unmatchedFiles', [])
}

def check_full_scans_status(self, head_full_scan_id: str, new_full_scan_id: str) -> bool:
is_ready = False
current_timeout = self.config.timeout
Expand Down Expand Up @@ -667,13 +732,13 @@ def create_new_diff(
"""
log.debug(f"starting create_new_diff with no_change: {no_change}")
if no_change:
return Diff(id="no_diff_id", diff_url="", report_url="")
return Diff(id="NO_DIFF_RAN", diff_url="", report_url="")

# Find manifest files
files = self.find_files(path)
files_for_sending = self.load_files_for_sending(files, path)
if not files:
return Diff(id="no_diff_id", diff_url="", report_url="")
return Diff(id="NO_DIFF_RAN", diff_url="", report_url="")

try:
# Get head scan ID
Expand Down Expand Up @@ -809,12 +874,6 @@ def create_diff_report(

return diff

def get_all_scores(self, packages: dict[str, Package]) -> dict[str, Package]:
components = []
for package_id in packages:
package = packages[package_id]
return packages

def create_purl(self, package_id: str, packages: dict[str, Package]) -> Purl:
"""
Creates the extended PURL data for package identification and tracking.
Expand Down
136 changes: 99 additions & 37 deletions socketsecurity/socketcli.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,19 +75,49 @@ def main_code():
log.debug("loaded client")
core = Core(socket_config, sdk)
log.debug("loaded core")
# Load files - files defaults to "[]" in CliConfig
# Parse files argument
try:
files = json.loads(config.files) # Will always succeed with empty list by default
is_repo = True # FIXME: This is misleading - JSON parsing success doesn't indicate repo status
if isinstance(config.files, list):
# Already a list, use as-is
specified_files = config.files
elif isinstance(config.files, str):
# Handle different string formats
files_str = config.files.strip()

# If the string is wrapped in extra quotes, strip them
if ((files_str.startswith('"') and files_str.endswith('"')) or
(files_str.startswith("'") and files_str.endswith("'"))):
# Check if the inner content looks like JSON
inner_str = files_str[1:-1]
if inner_str.startswith('[') and inner_str.endswith(']'):
files_str = inner_str

# Try to parse as JSON
try:
specified_files = json.loads(files_str)
except json.JSONDecodeError:
# If JSON parsing fails, try replacing single quotes with double quotes
files_str = files_str.replace("'", '"')
specified_files = json.loads(files_str)
else:
# Default to empty list
specified_files = []
except Exception as error:
# Only hits this if files was manually set to invalid JSON
log.error(f"Unable to parse {config.files}")
log.error(error)
log.error(f"Unable to parse files argument: {config.files}")
log.error(f"Error details: {error}")
log.debug(f"Files type: {type(config.files)}")
log.debug(f"Files repr: {repr(config.files)}")
sys.exit(3)

# Determine if files were explicitly specified
files_explicitly_specified = config.files != "[]" and len(specified_files) > 0

# Git setup
is_repo = False
git_repo = None
try:
git_repo = Git(config.target_path)
is_repo = True
if not config.repo:
config.repo = git_repo.repo_name
if not config.commit_sha:
Expand All @@ -98,12 +128,10 @@ def main_code():
config.committers = [git_repo.committer]
if not config.commit_message:
config.commit_message = git_repo.commit_message
if files and not config.ignore_commit_files: # files is empty by default, so this is False unless files manually specified
files = git_repo.changed_files # Only gets git's changed files if files were manually specified
is_repo = True # Redundant since already True
except InvalidGitRepositoryError:
is_repo = False # Overwrites previous True - this is the REAL repo status
config.ignore_commit_files = True # Silently changes config - should log this
is_repo = False
log.debug("Not a git repository, setting ignore_commit_files=True")
config.ignore_commit_files = True
except NoSuchPathError:
raise Exception(f"Unable to find path {config.target_path}")

Expand All @@ -125,26 +153,43 @@ def main_code():
if scm is not None:
config.default_branch = scm.config.is_default_branch

# Determine files to check based on the new logic
files_to_check = []
force_api_mode = False

if files_explicitly_specified:
# Case 2: Files are specified - use them and don't check commit details
files_to_check = specified_files
log.debug(f"Using explicitly specified files: {files_to_check}")
elif not config.ignore_commit_files and is_repo:
# Case 1: Files not specified and --ignore-commit-files not set - try to find changed files from commit
files_to_check = git_repo.changed_files
log.debug(f"Using changed files from commit: {files_to_check}")
else:
# ignore_commit_files is set or not a repo - scan everything but force API mode if no supported files
files_to_check = []
log.debug("No files to check from commit (ignore_commit_files=True or not a repo)")

# Combine manually specified files with git changes if applicable
files_to_check = set(json.loads(config.files)) # Start with manually specified files

# Add git changes if this is a repo and we're not ignoring commit files
if is_repo and not config.ignore_commit_files and not files_to_check:
files_to_check.update(git_repo.changed_files)

# Determine if we need to scan based on manifest files
should_skip_scan = True # Default to skipping
if config.ignore_commit_files:
should_skip_scan = False # Force scan if ignoring commit files
elif files_to_check: # If we have any files to check
should_skip_scan = not core.has_manifest_files(list(files_to_check))
log.debug(f"in elif, should_skip_scan: {should_skip_scan}")

if should_skip_scan:
log.debug("No manifest files found in changes, skipping scan")
# Check if we have supported manifest files
has_supported_files = files_to_check and core.has_manifest_files(files_to_check)

# Case 3: If no supported files or files are empty, force API mode (no PR comments)
if not has_supported_files:
force_api_mode = True
log.debug("No supported manifest files found, forcing API mode")

# Determine scan behavior
should_skip_scan = False # Always perform scan, but behavior changes based on supported files
if config.ignore_commit_files and not files_explicitly_specified:
# Force full scan when ignoring commit files and no explicit files
should_skip_scan = False
log.debug("Forcing full scan due to ignore_commit_files")
elif not has_supported_files:
# No supported files - still scan but in API mode
should_skip_scan = False
log.debug("No supported files but will scan in API mode")
else:
log.debug("Found manifest files or forced scan, proceeding")
log.debug("Found supported manifest files, proceeding with normal scan")

org_slug = core.config.org_slug
if config.repo_is_public:
Expand Down Expand Up @@ -177,6 +222,8 @@ def main_code():
# Initialize diff
diff = Diff()
diff.id = "NO_DIFF_RAN"
diff.diff_url = ""
diff.report_url = ""

# Handle SCM-specific flows
if scm is not None and scm.check_event_type() == "comment":
Expand All @@ -192,11 +239,9 @@ def main_code():
log.debug("Removing comment alerts")
scm.remove_comment_alerts(comments)

elif scm is not None and scm.check_event_type() != "comment":
elif scm is not None and scm.check_event_type() != "comment" and not force_api_mode:
log.info("Push initiated flow")
if should_skip_scan:
log.info("No manifest files changes, skipping scan")
elif scm.check_event_type() == "diff":
if scm.check_event_type() == "diff":
log.info("Starting comment logic for PR/MR event")
diff = core.create_new_diff(config.target_path, params, no_change=should_skip_scan)
comments = scm.get_comments_for_pr()
Expand Down Expand Up @@ -255,12 +300,24 @@ def main_code():

output_handler.handle_output(diff)
else:
log.info("API Mode")
diff = core.create_new_diff(config.target_path, params, no_change=should_skip_scan)
output_handler.handle_output(diff)
if force_api_mode:
log.info("No Manifest files changed, creating Socket Report")
else:
log.info("API Mode")
full_scan_result = core.create_full_scan_with_report_url(config.target_path, params, no_change=should_skip_scan)
log.info(f"Full scan created with ID: {full_scan_result['id']}")
log.info(f"Full scan report URL: {full_scan_result['html_report_url']}")

# Create a minimal diff-like object for compatibility with downstream code
diff = Diff()
diff.id = full_scan_result['id']
diff.report_url = full_scan_result['html_report_url']
diff.diff_url = full_scan_result['html_report_url']
diff.packages = {} # No package data needed for API mode
# No output handling needed for API mode - just creating the scan

# Handle license generation
if not should_skip_scan and diff.id != "no_diff_id" and config.generate_license:
if not should_skip_scan and diff.id != "NO_DIFF_RAN" and diff.id != "NO_SCAN_RAN" and config.generate_license:
all_packages = {}
for purl in diff.packages:
package = diff.packages[purl]
Expand All @@ -279,6 +336,11 @@ def main_code():
all_packages[package.id] = output
core.save_file(config.license_file_name, json.dumps(all_packages))

# If we forced API mode due to no supported files, behave as if --disable-blocking was set
if force_api_mode and not config.disable_blocking:
log.debug("Temporarily enabling disable_blocking due to no supported manifest files")
config.disable_blocking = True

sys.exit(output_handler.return_exit_code(diff))


Expand Down
Loading