Skip to content

Commit 50fe8f1

Browse files
authored
Doug/fix diff scan license options (#40)
* Fixed the old diff scan endpoint to correctly use the include_license_details flag and default it to true
* Bumped version
* Fixed license attributes that had changed for artifacts
* feat: implement lazy file loading to prevent 'too many open files' errors
  - Add FileDescriptorManager singleton to track and limit open file descriptors
  - Implement LazyFileLoader class that opens files only when needed for reading
  - Add configurable max_open_files parameter to fullscans.post() and diffscans.create_from_repo()
  - Auto-close files when fully read and use LRU eviction when limit reached
  - Add comprehensive documentation with v3.0 migration notes
  - Maintain backward compatibility with use_lazy_loading=False default
  - Support cross-platform operation (Unix/Linux/macOS/Windows)
  - Include retry logic with garbage collection for edge cases

This prevents file descriptor exhaustion when uploading large numbers of manifest files (e.g., 1956 files) on systems with low ulimit values.
1 parent 88a2d96 commit 50fe8f1

File tree

7 files changed

+358
-81
lines changed

7 files changed

+358
-81
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ dist
1212
*.build
1313
*.dist
1414
*.egg-info
15-
*.cpython-312.pyc
15+
*.cpython-312.pyc
16+
example-socket-export.py

socketdev/dependencies/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from urllib.parse import urlencode
33
import logging
44
from socketdev.tools import load_files
5+
from ..utils import Utils
56

67
log = logging.getLogger("socketdev")
78

@@ -12,9 +13,13 @@ class Dependencies:
1213
def __init__(self, api):
1314
self.api = api
1415

15-
def post(self, files: list, params: dict) -> dict:
16-
loaded_files = []
17-
loaded_files = load_files(files, loaded_files)
16+
def post(self, files: list, params: dict, use_lazy_loading: bool = False, workspace: str = None) -> dict:
17+
if use_lazy_loading:
18+
loaded_files = Utils.load_files_for_sending_lazy(files, workspace)
19+
else:
20+
loaded_files = []
21+
loaded_files = load_files(files, loaded_files)
22+
1823
path = "dependencies/upload?" + urlencode(params)
1924
response = self.api.do_request(path=path, files=loaded_files, method="POST")
2025
if response.status_code == 200:

socketdev/diffscans/__init__.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import logging
33
from typing import Any, Dict, Optional, Union
4+
from ..utils import Utils
45

56
log = logging.getLogger("socketdev")
67

@@ -29,13 +30,44 @@ def get(self, org_slug: str, diff_scan_id: str) -> dict:
2930
log.error(f"Error fetching diff scan: {response.status_code}, message: {response.text}")
3031
return {}
3132

32-
def create_from_repo(self, org_slug: str, repo_slug: str, files: list, params: Optional[Dict[str, Any]] = None) -> dict:
33-
"""Create a diff scan from repo HEAD, uploading files as multipart form data."""
33+
def create_from_repo(self, org_slug: str, repo_slug: str, files: list, params: Optional[Dict[str, Any]] = None, use_lazy_loading: bool = False, workspace: str = None, max_open_files: int = 100) -> dict:
34+
"""
35+
Create a diff scan from repo HEAD, uploading files as multipart form data.
36+
37+
Args:
38+
org_slug: Organization slug
39+
repo_slug: Repository slug
40+
files: List of file paths to upload for scanning
41+
params: Optional query parameters for the request
42+
use_lazy_loading: Whether to use lazy file loading to prevent "too many open files"
43+
errors when uploading large numbers of files (default: False)
44+
NOTE: In version 3.0, this will default to True for better performance
45+
workspace: Base directory path to make file paths relative to
46+
max_open_files: Maximum number of files to keep open simultaneously when using
47+
lazy loading. Useful for systems with low ulimit values (default: 100)
48+
49+
Returns:
50+
dict: API response containing diff scan results
51+
52+
Note:
53+
When use_lazy_loading=True, files are opened only when needed during upload,
54+
preventing file descriptor exhaustion. The max_open_files parameter controls how many
55+
files can be open simultaneously - set this lower on systems with restrictive ulimits.
56+
57+
For large file uploads (>100 files), it's recommended to set use_lazy_loading=True.
58+
"""
3459
import urllib.parse
3560
path = f"orgs/{org_slug}/diff-scans/from-repo/{repo_slug}"
3661
if params:
3762
path += "?" + urllib.parse.urlencode(params)
38-
response = self.api.do_request(path=path, method="POST", files=files)
63+
64+
# Use lazy loading if requested
65+
if use_lazy_loading:
66+
prepared_files = Utils.load_files_for_sending_lazy(files, workspace, max_open_files)
67+
else:
68+
prepared_files = files
69+
70+
response = self.api.do_request(path=path, method="POST", files=prepared_files)
3971
if response.status_code in (200, 201):
4072
return response.json()
4173
log.error(f"Error creating diff scan from repo: {response.status_code}, message: {response.text}")

socketdev/fullscans/__init__.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -343,11 +343,9 @@ def from_dict(cls, data: dict) -> "LicenseMatch":
343343
@dataclass
344344
class LicenseDetail:
345345
authors: List[str]
346-
charEnd: int
347-
charStart: int
346+
errorData: str
348347
filepath: str
349348
match_strength: int
350-
filehash: str
351349
provenance: str
352350
spdxDisj: List[List[LicenseMatch]]
353351

@@ -360,14 +358,13 @@ def to_dict(self):
360358
@classmethod
361359
def from_dict(cls, data: dict) -> "LicenseDetail":
362360
return cls(
361+
spdxDisj=data["spdxDisj"],
363362
authors=data["authors"],
364-
charEnd=data["charEnd"],
365-
charStart=data["charStart"],
363+
errorData=data["errorData"],
364+
provenance=data["provenance"],
366365
filepath=data["filepath"],
367366
match_strength=data["match_strength"],
368-
filehash=data["filehash"],
369-
provenance=data["provenance"],
370-
spdxDisj=[[LicenseMatch.from_dict(match) for match in group] for group in data["spdxDisj"]],
367+
371368
)
372369

373370

@@ -723,15 +720,45 @@ def get(self, org_slug: str, params: dict, use_types: bool = False) -> Union[dic
723720
)
724721
return {}
725722

726-
def post(self, files: list, params: FullScanParams, use_types: bool = False) -> Union[dict, CreateFullScanResponse]:
723+
def post(self, files: list, params: FullScanParams, use_types: bool = False, use_lazy_loading: bool = False, workspace: str = None, max_open_files: int = 100) -> Union[dict, CreateFullScanResponse]:
724+
"""
725+
Create a new full scan by uploading manifest files.
726+
727+
Args:
728+
files: List of file paths to upload for scanning
729+
params: FullScanParams object containing scan configuration
730+
use_types: Whether to return typed response objects (default: False)
731+
use_lazy_loading: Whether to use lazy file loading to prevent "too many open files"
732+
errors when uploading large numbers of files (default: False)
733+
NOTE: In version 3.0, this will default to True for better performance
734+
workspace: Base directory path to make file paths relative to
735+
max_open_files: Maximum number of files to keep open simultaneously when using
736+
lazy loading. Useful for systems with low ulimit values (default: 100)
737+
738+
Returns:
739+
dict or CreateFullScanResponse: API response containing scan results
740+
741+
Note:
742+
When use_lazy_loading=True, files are opened only when needed during upload,
743+
preventing file descriptor exhaustion. The max_open_files parameter controls how many
744+
files can be open simultaneously - set this lower on systems with restrictive ulimits.
745+
746+
For large file uploads (>100 files), it's recommended to set use_lazy_loading=True.
747+
"""
727748
Utils.validate_integration_type(params.integration_type if params.integration_type else "api")
728749
org_slug = str(params.org_slug)
729750
params_dict = params.to_dict()
730751
params_dict.pop("org_slug")
731752
params_arg = urllib.parse.urlencode(params_dict)
732753
path = "orgs/" + org_slug + "/full-scans?" + str(params_arg)
733754

734-
response = self.api.do_request(path=path, method="POST", files=files)
755+
# Use lazy loading if requested
756+
if use_lazy_loading:
757+
prepared_files = Utils.load_files_for_sending_lazy(files, workspace, max_open_files)
758+
else:
759+
prepared_files = files
760+
761+
response = self.api.do_request(path=path, method="POST", files=prepared_files)
735762

736763
if response.status_code == 201:
737764
result = response.json()
@@ -766,10 +793,10 @@ def stream_diff(
766793
before: str,
767794
after: str,
768795
use_types: bool = True,
769-
include_license_details: bool = False,
796+
include_license_details: str = "true",
770797
**kwargs,
771798
) -> Union[dict, StreamDiffResponse]:
772-
path = f"orgs/{org_slug}/full-scans/diff?before={before}&after={after}&{include_license_details}"
799+
path = f"orgs/{org_slug}/full-scans/diff?before={before}&after={after}&include_license_details={include_license_details}"
773800
if kwargs:
774801
for key, value in kwargs.items():
775802
path += f"&{key}={value}"

0 commit comments

Comments
 (0)