Skip to content

Commit 29bb083

Browse files
committed
loader: fix recycled TID overwriting matched calls in dynamic layout
When an OS recycles a thread ID (TID) within the same process, compute_dynamic_layout() reset calls_by_thread[t.address] to an empty list on the second encounter of that ThreadAddress, erasing the matched calls accumulated from the first thread instance. Because those calls were still present in matched_calls (added by the rule engine), the renderer could not locate them in the layout and raised ValueError("name not found for call"). Fix the overwrite by using setdefault() instead of direct assignment, and guard the threads_by_process.append() with a membership check so that a recycled TID does not produce a duplicate thread entry in the layout. A dedicated unit test covering the recycled-TID scenario is added in tests/test_loader.py. Fixes #2619
1 parent e9b3311 commit 29bb083

File tree

3 files changed

+217
-30
lines changed

3 files changed

+217
-30
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
-
3333

3434
### Bug Fixes
35+
- loader: fix recycled TID overwriting matched calls in dynamic layout, causing ValueError during rendering @devs6186 #2619
3536
- main: suggest --os flag in unsupported OS error message to help users override ELF OS detection @devs6186 #2577
3637
- render: escape sample-controlled strings before passing to Rich to prevent MarkupError @devs6186 #2699
3738
- Fixed insecure deserialization vulnerability in YAML loading @0x1622 (#2770)

capa/loader.py

Lines changed: 99 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,11 @@
3131
import capa.features.extractors.common
3232
from capa.rules import RuleSet
3333
from capa.engine import MatchResults
34-
from capa.exceptions import UnsupportedOSError, UnsupportedArchError, UnsupportedFormatError
34+
from capa.exceptions import (
35+
UnsupportedOSError,
36+
UnsupportedArchError,
37+
UnsupportedFormatError,
38+
)
3539
from capa.features.common import (
3640
OS_AUTO,
3741
FORMAT_PE,
@@ -160,9 +164,13 @@ def get_workspace(path: Path, input_format: str, sigpaths: list[Path]):
160164
vw = viv_utils.getWorkspace(str(path), analyze=False, should_save=False)
161165
elif input_format == FORMAT_SC32:
162166
# these are not analyzed nor saved.
163-
vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="i386", analyze=False)
167+
vw = viv_utils.getShellcodeWorkspaceFromFile(
168+
str(path), arch="i386", analyze=False
169+
)
164170
elif input_format == FORMAT_SC64:
165-
vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="amd64", analyze=False)
171+
vw = viv_utils.getShellcodeWorkspaceFromFile(
172+
str(path), arch="amd64", analyze=False
173+
)
166174
else:
167175
raise ValueError("unexpected format: " + input_format)
168176
except envi.exc.SegmentationViolation as e:
@@ -231,20 +239,26 @@ def get_extractor(
231239
import capa.features.extractors.drakvuf.extractor
232240

233241
report = capa.helpers.load_jsonl_from_path(input_path)
234-
return capa.features.extractors.drakvuf.extractor.DrakvufExtractor.from_report(report)
242+
return capa.features.extractors.drakvuf.extractor.DrakvufExtractor.from_report(
243+
report
244+
)
235245

236246
elif backend == BACKEND_VMRAY:
237247
import capa.features.extractors.vmray.extractor
238248

239-
return capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_path)
249+
return capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(
250+
input_path
251+
)
240252

241253
elif backend == BACKEND_DOTNET:
242254
import capa.features.extractors.dnfile.extractor
243255

244256
if input_format not in (FORMAT_PE, FORMAT_DOTNET):
245257
raise UnsupportedFormatError()
246258

247-
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(input_path)
259+
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(
260+
input_path
261+
)
248262

249263
elif backend == BACKEND_BINJA:
250264
import capa.features.extractors.binja.find_binja_api as finder
@@ -303,11 +317,15 @@ def get_extractor(
303317
vw.saveWorkspace()
304318
except IOError:
305319
# see #168 for discussion around how to handle non-writable directories
306-
logger.info("source directory is not writable, won't save intermediate workspace")
320+
logger.info(
321+
"source directory is not writable, won't save intermediate workspace"
322+
)
307323
else:
308324
logger.debug("CAPA_SAVE_WORKSPACE unset, not saving workspace")
309325

310-
return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, input_path, os_)
326+
return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(
327+
vw, input_path, os_
328+
)
311329

312330
elif backend == BACKEND_FREEZE:
313331
return frz.load(input_path.read_bytes())
@@ -320,7 +338,9 @@ def get_extractor(
320338
assert sample_path is not None
321339
buf = sample_path.read_bytes()
322340

323-
return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf)
341+
return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(
342+
be2, buf
343+
)
324344

325345
elif backend == BACKEND_IDA:
326346
import capa.features.extractors.ida.idalib as idalib
@@ -351,7 +371,9 @@ def get_extractor(
351371
# -1 - Generic errors (database already open, auto-analysis failed, etc.)
352372
# -2 - User cancelled operation
353373
ret = idapro.open_database(
354-
str(input_path), run_auto_analysis=True, args="-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R"
374+
str(input_path),
375+
run_auto_analysis=True,
376+
args="-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R",
355377
)
356378
if ret != 0:
357379
raise RuntimeError("failed to analyze input file")
@@ -386,12 +408,19 @@ def get_extractor(
386408
monitor = TaskMonitor.DUMMY
387409

388410
# Import file
389-
loader = pyghidra.program_loader().project(project).source(str(input_path)).name(input_path.name)
411+
loader = (
412+
pyghidra.program_loader()
413+
.project(project)
414+
.source(str(input_path))
415+
.name(input_path.name)
416+
)
390417
with loader.load() as load_results:
391418
load_results.save(monitor)
392419

393420
# Open program
394-
program, consumer = pyghidra.consume_program(project, "/" + input_path.name)
421+
program, consumer = pyghidra.consume_program(
422+
project, "/" + input_path.name
423+
)
395424

396425
# Analyze
397426
pyghidra.analyze(program, monitor)
@@ -424,7 +453,9 @@ def __exit__(self, exc_type, exc_val, exc_tb):
424453

425454
import capa.features.extractors.ghidra.extractor
426455

427-
return capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor(ctx_manager=cm, tmpdir=tmpdir)
456+
return capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor(
457+
ctx_manager=cm, tmpdir=tmpdir
458+
)
428459
else:
429460
raise ValueError("unexpected backend: " + backend)
430461

@@ -461,37 +492,55 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr
461492
if input_format == FORMAT_PE:
462493
import capa.features.extractors.pefile
463494

464-
file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file))
495+
file_extractors.append(
496+
capa.features.extractors.pefile.PefileFeatureExtractor(input_file)
497+
)
465498

466499
elif input_format == FORMAT_DOTNET:
467500
import capa.features.extractors.pefile
468501
import capa.features.extractors.dotnetfile
469502

470-
file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file))
471-
file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file))
503+
file_extractors.append(
504+
capa.features.extractors.pefile.PefileFeatureExtractor(input_file)
505+
)
506+
file_extractors.append(
507+
capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file)
508+
)
472509

473510
elif input_format == FORMAT_ELF:
474511
import capa.features.extractors.elffile
475512

476-
file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input_file))
513+
file_extractors.append(
514+
capa.features.extractors.elffile.ElfFeatureExtractor(input_file)
515+
)
477516

478517
elif input_format == FORMAT_CAPE:
479518
import capa.features.extractors.cape.extractor
480519

481520
report = capa.helpers.load_json_from_path(input_file)
482-
file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report))
521+
file_extractors.append(
522+
capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)
523+
)
483524

484525
elif input_format == FORMAT_DRAKVUF:
485526
import capa.helpers
486527
import capa.features.extractors.drakvuf.extractor
487528

488529
report = capa.helpers.load_jsonl_from_path(input_file)
489-
file_extractors.append(capa.features.extractors.drakvuf.extractor.DrakvufExtractor.from_report(report))
530+
file_extractors.append(
531+
capa.features.extractors.drakvuf.extractor.DrakvufExtractor.from_report(
532+
report
533+
)
534+
)
490535

491536
elif input_format == FORMAT_VMRAY:
492537
import capa.features.extractors.vmray.extractor
493538

494-
file_extractors.append(capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_file))
539+
file_extractors.append(
540+
capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(
541+
input_file
542+
)
543+
)
495544

496545
elif input_format == FORMAT_BINEXPORT2:
497546
file_extractors = _get_binexport2_file_extractors(input_file)
@@ -501,7 +550,9 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr
501550

502551
def get_signatures(sigs_path: Path) -> list[Path]:
503552
if not sigs_path.exists():
504-
raise IOError(f"signatures path {sigs_path} does not exist or cannot be accessed")
553+
raise IOError(
554+
f"signatures path {sigs_path} does not exist or cannot be accessed"
555+
)
505556

506557
paths: list[Path] = []
507558
if sigs_path.is_file():
@@ -525,7 +576,9 @@ def get_signatures(sigs_path: Path) -> list[Path]:
525576
return paths
526577

527578

528-
def get_sample_analysis(format_, arch, os_, extractor, rules_path, feature_counts, library_functions):
579+
def get_sample_analysis(
580+
format_, arch, os_, extractor, rules_path, feature_counts, library_functions
581+
):
529582
if isinstance(extractor, StaticFeatureExtractor):
530583
return rdoc.StaticAnalysis(
531584
format=format_,
@@ -575,12 +628,20 @@ def collect_metadata(
575628
md5, sha1, sha256 = sample_hashes.md5, sample_hashes.sha1, sample_hashes.sha256
576629

577630
global_feats = list(extractor.extract_global_features())
578-
extractor_format = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Format)]
579-
extractor_arch = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Arch)]
580-
extractor_os = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.OS)]
631+
extractor_format = [
632+
f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Format)
633+
]
634+
extractor_arch = [
635+
f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Arch)
636+
]
637+
extractor_os = [
638+
f.value for (f, _) in global_feats if isinstance(f, capa.features.common.OS)
639+
]
581640

582641
input_format = (
583-
str(extractor_format[0]) if extractor_format else "unknown" if input_format == FORMAT_AUTO else input_format
642+
str(extractor_format[0])
643+
if extractor_format
644+
else "unknown" if input_format == FORMAT_AUTO else input_format
584645
)
585646
arch = str(extractor_arch[0]) if extractor_arch else "unknown"
586647
os_ = str(extractor_os[0]) if extractor_os else "unknown" if os_ == OS_AUTO else os_
@@ -655,14 +716,18 @@ def result_rec(result: capa.features.common.Result):
655716
threads_by_process[p.address] = []
656717

657718
for t in extractor.get_threads(p):
658-
calls_by_thread[t.address] = []
719+
# use setdefault so that a recycled TID (same pid+tid seen again) accumulates
720+
# calls from all its instances rather than overwriting the prior instance's calls.
721+
calls_by_thread.setdefault(t.address, [])
659722

660723
for c in extractor.get_calls(p, t):
661724
if c.address in matched_calls:
662725
names_by_call[c.address] = extractor.get_call_name(p, t, c)
663726
calls_by_thread[t.address].append(c.address)
664727

665-
if calls_by_thread[t.address]:
728+
# only register the thread address once; a recycled TID must not create
729+
# a duplicate entry in threads_by_process or cause a double-add to matched_threads.
730+
if calls_by_thread[t.address] and t.address not in matched_threads:
666731
matched_threads.add(t.address)
667732
threads_by_process[p.address].append(t.address)
668733

@@ -700,7 +765,9 @@ def result_rec(result: capa.features.common.Result):
700765
return layout
701766

702767

703-
def compute_static_layout(rules: RuleSet, extractor: StaticFeatureExtractor, capabilities) -> rdoc.StaticLayout:
768+
def compute_static_layout(
769+
rules: RuleSet, extractor: StaticFeatureExtractor, capabilities
770+
) -> rdoc.StaticLayout:
704771
"""
705772
compute a metadata structure that links basic blocks
706773
to the functions in which they're found.
@@ -730,7 +797,9 @@ def compute_static_layout(rules: RuleSet, extractor: StaticFeatureExtractor, cap
730797
rdoc.FunctionLayout(
731798
address=frz.Address.from_capa(f),
732799
matched_basic_blocks=tuple(
733-
rdoc.BasicBlockLayout(address=frz.Address.from_capa(bb)) for bb in bbs if bb in matched_bbs
800+
rdoc.BasicBlockLayout(address=frz.Address.from_capa(bb))
801+
for bb in bbs
802+
if bb in matched_bbs
734803
), # this object is open to extension in the future,
735804
# such as with the function name, etc.
736805
)

0 commit comments

Comments
 (0)