3131import capa .features .extractors .common
3232from capa .rules import RuleSet
3333from capa .engine import MatchResults
34- from capa .exceptions import UnsupportedOSError , UnsupportedArchError , UnsupportedFormatError
34+ from capa .exceptions import (
35+ UnsupportedOSError ,
36+ UnsupportedArchError ,
37+ UnsupportedFormatError ,
38+ )
3539from capa .features .common import (
3640 OS_AUTO ,
3741 FORMAT_PE ,
@@ -160,9 +164,13 @@ def get_workspace(path: Path, input_format: str, sigpaths: list[Path]):
160164 vw = viv_utils .getWorkspace (str (path ), analyze = False , should_save = False )
161165 elif input_format == FORMAT_SC32 :
162166 # these are not analyzed nor saved.
163- vw = viv_utils .getShellcodeWorkspaceFromFile (str (path ), arch = "i386" , analyze = False )
167+ vw = viv_utils .getShellcodeWorkspaceFromFile (
168+ str (path ), arch = "i386" , analyze = False
169+ )
164170 elif input_format == FORMAT_SC64 :
165- vw = viv_utils .getShellcodeWorkspaceFromFile (str (path ), arch = "amd64" , analyze = False )
171+ vw = viv_utils .getShellcodeWorkspaceFromFile (
172+ str (path ), arch = "amd64" , analyze = False
173+ )
166174 else :
167175 raise ValueError ("unexpected format: " + input_format )
168176 except envi .exc .SegmentationViolation as e :
@@ -231,20 +239,26 @@ def get_extractor(
231239 import capa .features .extractors .drakvuf .extractor
232240
233241 report = capa .helpers .load_jsonl_from_path (input_path )
234- return capa .features .extractors .drakvuf .extractor .DrakvufExtractor .from_report (report )
242+ return capa .features .extractors .drakvuf .extractor .DrakvufExtractor .from_report (
243+ report
244+ )
235245
236246 elif backend == BACKEND_VMRAY :
237247 import capa .features .extractors .vmray .extractor
238248
239- return capa .features .extractors .vmray .extractor .VMRayExtractor .from_zipfile (input_path )
249+ return capa .features .extractors .vmray .extractor .VMRayExtractor .from_zipfile (
250+ input_path
251+ )
240252
241253 elif backend == BACKEND_DOTNET :
242254 import capa .features .extractors .dnfile .extractor
243255
244256 if input_format not in (FORMAT_PE , FORMAT_DOTNET ):
245257 raise UnsupportedFormatError ()
246258
247- return capa .features .extractors .dnfile .extractor .DnfileFeatureExtractor (input_path )
259+ return capa .features .extractors .dnfile .extractor .DnfileFeatureExtractor (
260+ input_path
261+ )
248262
249263 elif backend == BACKEND_BINJA :
250264 import capa .features .extractors .binja .find_binja_api as finder
@@ -303,11 +317,15 @@ def get_extractor(
303317 vw .saveWorkspace ()
304318 except IOError :
305319 # see #168 for discussion around how to handle non-writable directories
306- logger .info ("source directory is not writable, won't save intermediate workspace" )
320+ logger .info (
321+ "source directory is not writable, won't save intermediate workspace"
322+ )
307323 else :
308324 logger .debug ("CAPA_SAVE_WORKSPACE unset, not saving workspace" )
309325
310- return capa .features .extractors .viv .extractor .VivisectFeatureExtractor (vw , input_path , os_ )
326+ return capa .features .extractors .viv .extractor .VivisectFeatureExtractor (
327+ vw , input_path , os_
328+ )
311329
312330 elif backend == BACKEND_FREEZE :
313331 return frz .load (input_path .read_bytes ())
@@ -320,7 +338,9 @@ def get_extractor(
320338 assert sample_path is not None
321339 buf = sample_path .read_bytes ()
322340
323- return capa .features .extractors .binexport2 .extractor .BinExport2FeatureExtractor (be2 , buf )
341+ return capa .features .extractors .binexport2 .extractor .BinExport2FeatureExtractor (
342+ be2 , buf
343+ )
324344
325345 elif backend == BACKEND_IDA :
326346 import capa .features .extractors .ida .idalib as idalib
@@ -351,7 +371,9 @@ def get_extractor(
351371 # -1 - Generic errors (database already open, auto-analysis failed, etc.)
352372 # -2 - User cancelled operation
353373 ret = idapro .open_database (
354- str (input_path ), run_auto_analysis = True , args = "-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R"
374+ str (input_path ),
375+ run_auto_analysis = True ,
376+ args = "-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R" ,
355377 )
356378 if ret != 0 :
357379 raise RuntimeError ("failed to analyze input file" )
@@ -386,12 +408,19 @@ def get_extractor(
386408 monitor = TaskMonitor .DUMMY
387409
388410 # Import file
389- loader = pyghidra .program_loader ().project (project ).source (str (input_path )).name (input_path .name )
411+ loader = (
412+ pyghidra .program_loader ()
413+ .project (project )
414+ .source (str (input_path ))
415+ .name (input_path .name )
416+ )
390417 with loader .load () as load_results :
391418 load_results .save (monitor )
392419
393420 # Open program
394- program , consumer = pyghidra .consume_program (project , "/" + input_path .name )
421+ program , consumer = pyghidra .consume_program (
422+ project , "/" + input_path .name
423+ )
395424
396425 # Analyze
397426 pyghidra .analyze (program , monitor )
@@ -424,7 +453,9 @@ def __exit__(self, exc_type, exc_val, exc_tb):
424453
425454 import capa .features .extractors .ghidra .extractor
426455
427- return capa .features .extractors .ghidra .extractor .GhidraFeatureExtractor (ctx_manager = cm , tmpdir = tmpdir )
456+ return capa .features .extractors .ghidra .extractor .GhidraFeatureExtractor (
457+ ctx_manager = cm , tmpdir = tmpdir
458+ )
428459 else :
429460 raise ValueError ("unexpected backend: " + backend )
430461
@@ -461,37 +492,55 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr
461492 if input_format == FORMAT_PE :
462493 import capa .features .extractors .pefile
463494
464- file_extractors .append (capa .features .extractors .pefile .PefileFeatureExtractor (input_file ))
495+ file_extractors .append (
496+ capa .features .extractors .pefile .PefileFeatureExtractor (input_file )
497+ )
465498
466499 elif input_format == FORMAT_DOTNET :
467500 import capa .features .extractors .pefile
468501 import capa .features .extractors .dotnetfile
469502
470- file_extractors .append (capa .features .extractors .pefile .PefileFeatureExtractor (input_file ))
471- file_extractors .append (capa .features .extractors .dotnetfile .DotnetFileFeatureExtractor (input_file ))
503+ file_extractors .append (
504+ capa .features .extractors .pefile .PefileFeatureExtractor (input_file )
505+ )
506+ file_extractors .append (
507+ capa .features .extractors .dotnetfile .DotnetFileFeatureExtractor (input_file )
508+ )
472509
473510 elif input_format == FORMAT_ELF :
474511 import capa .features .extractors .elffile
475512
476- file_extractors .append (capa .features .extractors .elffile .ElfFeatureExtractor (input_file ))
513+ file_extractors .append (
514+ capa .features .extractors .elffile .ElfFeatureExtractor (input_file )
515+ )
477516
478517 elif input_format == FORMAT_CAPE :
479518 import capa .features .extractors .cape .extractor
480519
481520 report = capa .helpers .load_json_from_path (input_file )
482- file_extractors .append (capa .features .extractors .cape .extractor .CapeExtractor .from_report (report ))
521+ file_extractors .append (
522+ capa .features .extractors .cape .extractor .CapeExtractor .from_report (report )
523+ )
483524
484525 elif input_format == FORMAT_DRAKVUF :
485526 import capa .helpers
486527 import capa .features .extractors .drakvuf .extractor
487528
488529 report = capa .helpers .load_jsonl_from_path (input_file )
489- file_extractors .append (capa .features .extractors .drakvuf .extractor .DrakvufExtractor .from_report (report ))
530+ file_extractors .append (
531+ capa .features .extractors .drakvuf .extractor .DrakvufExtractor .from_report (
532+ report
533+ )
534+ )
490535
491536 elif input_format == FORMAT_VMRAY :
492537 import capa .features .extractors .vmray .extractor
493538
494- file_extractors .append (capa .features .extractors .vmray .extractor .VMRayExtractor .from_zipfile (input_file ))
539+ file_extractors .append (
540+ capa .features .extractors .vmray .extractor .VMRayExtractor .from_zipfile (
541+ input_file
542+ )
543+ )
495544
496545 elif input_format == FORMAT_BINEXPORT2 :
497546 file_extractors = _get_binexport2_file_extractors (input_file )
@@ -501,7 +550,9 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr
501550
502551def get_signatures (sigs_path : Path ) -> list [Path ]:
503552 if not sigs_path .exists ():
504- raise IOError (f"signatures path { sigs_path } does not exist or cannot be accessed" )
553+ raise IOError (
554+ f"signatures path { sigs_path } does not exist or cannot be accessed"
555+ )
505556
506557 paths : list [Path ] = []
507558 if sigs_path .is_file ():
@@ -525,7 +576,9 @@ def get_signatures(sigs_path: Path) -> list[Path]:
525576 return paths
526577
527578
528- def get_sample_analysis (format_ , arch , os_ , extractor , rules_path , feature_counts , library_functions ):
579+ def get_sample_analysis (
580+ format_ , arch , os_ , extractor , rules_path , feature_counts , library_functions
581+ ):
529582 if isinstance (extractor , StaticFeatureExtractor ):
530583 return rdoc .StaticAnalysis (
531584 format = format_ ,
@@ -575,12 +628,20 @@ def collect_metadata(
575628 md5 , sha1 , sha256 = sample_hashes .md5 , sample_hashes .sha1 , sample_hashes .sha256
576629
577630 global_feats = list (extractor .extract_global_features ())
578- extractor_format = [f .value for (f , _ ) in global_feats if isinstance (f , capa .features .common .Format )]
579- extractor_arch = [f .value for (f , _ ) in global_feats if isinstance (f , capa .features .common .Arch )]
580- extractor_os = [f .value for (f , _ ) in global_feats if isinstance (f , capa .features .common .OS )]
631+ extractor_format = [
632+ f .value for (f , _ ) in global_feats if isinstance (f , capa .features .common .Format )
633+ ]
634+ extractor_arch = [
635+ f .value for (f , _ ) in global_feats if isinstance (f , capa .features .common .Arch )
636+ ]
637+ extractor_os = [
638+ f .value for (f , _ ) in global_feats if isinstance (f , capa .features .common .OS )
639+ ]
581640
582641 input_format = (
583- str (extractor_format [0 ]) if extractor_format else "unknown" if input_format == FORMAT_AUTO else input_format
642+ str (extractor_format [0 ])
643+ if extractor_format
644+ else "unknown" if input_format == FORMAT_AUTO else input_format
584645 )
585646 arch = str (extractor_arch [0 ]) if extractor_arch else "unknown"
586647 os_ = str (extractor_os [0 ]) if extractor_os else "unknown" if os_ == OS_AUTO else os_
@@ -655,14 +716,18 @@ def result_rec(result: capa.features.common.Result):
655716 threads_by_process [p .address ] = []
656717
657718 for t in extractor .get_threads (p ):
658- calls_by_thread [t .address ] = []
719+ # use setdefault so that a recycled TID (same pid+tid seen again) accumulates
720+ # calls from all its instances rather than overwriting the prior instance's calls.
721+ calls_by_thread .setdefault (t .address , [])
659722
660723 for c in extractor .get_calls (p , t ):
661724 if c .address in matched_calls :
662725 names_by_call [c .address ] = extractor .get_call_name (p , t , c )
663726 calls_by_thread [t .address ].append (c .address )
664727
665- if calls_by_thread [t .address ]:
728+ # only register the thread address once; a recycled TID must not create
729+ # a duplicate entry in threads_by_process or cause a double-add to matched_threads.
730+ if calls_by_thread [t .address ] and t .address not in matched_threads :
666731 matched_threads .add (t .address )
667732 threads_by_process [p .address ].append (t .address )
668733
@@ -700,7 +765,9 @@ def result_rec(result: capa.features.common.Result):
700765 return layout
701766
702767
703- def compute_static_layout (rules : RuleSet , extractor : StaticFeatureExtractor , capabilities ) -> rdoc .StaticLayout :
768+ def compute_static_layout (
769+ rules : RuleSet , extractor : StaticFeatureExtractor , capabilities
770+ ) -> rdoc .StaticLayout :
704771 """
705772 compute a metadata structure that links basic blocks
706773 to the functions in which they're found.
@@ -730,7 +797,9 @@ def compute_static_layout(rules: RuleSet, extractor: StaticFeatureExtractor, cap
730797 rdoc .FunctionLayout (
731798 address = frz .Address .from_capa (f ),
732799 matched_basic_blocks = tuple (
733- rdoc .BasicBlockLayout (address = frz .Address .from_capa (bb )) for bb in bbs if bb in matched_bbs
800+ rdoc .BasicBlockLayout (address = frz .Address .from_capa (bb ))
801+ for bb in bbs
802+ if bb in matched_bbs
734803 ), # this object is open to extension in the future,
735804 # such as with the function name, etc.
736805 )
0 commit comments