3030"""
3131import os
3232import sys
33+ import json
3334import logging
3435import argparse
36+ import binascii
3537import collections
3638import multiprocessing
3739import multiprocessing .pool
38- from typing import Set , Dict , List
40+ from typing import Set , Dict , List , Union
3941from pathlib import Path
4042from datetime import date
4143
@@ -323,7 +325,7 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body):
323325
324326 comment = ""
325327 sig = ""
326- func_bytes = ""
328+ func_bytes = b ""
327329 for insn in body .instructions :
328330 comment += (
329331 "{:04X}" .format (insn .offset )
@@ -335,11 +337,11 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body):
335337 )
336338
337339 sig += insn .get_opcode_bytes ().hex ()
338- func_bytes += insn .get_opcode_bytes (). hex ()
340+ func_bytes += insn .get_opcode_bytes ()
339341
340342 if insn .operand :
341343 sig += "??" * len (insn .get_operand_bytes ())
342- func_bytes += insn .get_operand_bytes (). hex ()
344+ func_bytes += insn .get_operand_bytes ()
343345
344346 # Format the sig to be in the same style as the vivi portion (bytes separated by spaces)
345347 formatted_sig = ""
@@ -357,11 +359,25 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body):
class CodeFeature:
    """Basic object that will be used to create yara rules.

    Attributes:
        sig: signature string, stored stripped of surrounding whitespace and
            upper-cased.
        comment: free-form comment text, stored as given.
        bytez: raw bytes the signature was generated from.
        addr: location supplied by the caller (an int, a pair of ints, or None).
        filemd5: md5 of the file the feature came from, stored as given.
        scope: scope label supplied by the caller (callers in this file pass
            capa.rules.BASIC_BLOCK_SCOPE or capa.rules.FUNCTION_SCOPE).
    """

    def __init__(
        self,
        sig: str,
        comment: str,
        bytez: bytes,
        filemd5: str,
        # Quoted on purpose: an unquoted `tuple[int, int]` is evaluated when the
        # function is defined and raises TypeError on Python 3.8, where builtin
        # types are not subscriptable.
        addr: "Union[int, tuple[int, int], None]",
        scope: str,
    ) -> None:
        # Normalize once here so every consumer sees the same signature form.
        self.sig = sig.strip().upper()
        self.comment = comment
        self.bytez = bytez
        self.addr = addr
        self.filemd5 = filemd5
        self.scope = scope

    def json(self) -> dict:
        """Return a dict of plain values describing this feature.

        `bytez` is rendered as upper-case hex with a single space between
        bytes; all other fields are returned as stored.
        """
        spaced_hex = binascii.hexlify(self.bytez, " ", bytes_per_sep=1).decode("utf8").upper()
        return {
            "sig": self.sig,
            "comment": self.comment,
            "bytez": spaced_hex,
            "addr": self.addr,
            "filemd5": self.filemd5,
            "scope": self.scope,
        }
365381
366382
367383def get_code_features_for_capa_doc (doc : rd .ResultDocument , extractor ):
@@ -411,7 +427,7 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor):
411427
412428 bytez = get_cb_bytes (file_vw , addr )
413429 sig = genSigAndMask (addr , bytez , doc .meta .analysis .arch )
414- code_features .append (CodeFeature (sig , comment , bytez , filemd5 ))
430+ code_features .append (CodeFeature (sig , comment , bytez , filemd5 , addr , capa . rules . BASIC_BLOCK_SCOPE ))
415431
416432 for addr , rules in func_matches .items ():
417433 comment = f"function at 0x{ addr :08x} @{ filemd5 } with { len (rules )} features:\n "
@@ -421,7 +437,7 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor):
421437
422438 bytez = get_function_bytes (file_vw , addr )
423439 sig = genSigAndMask (addr , bytez , doc .meta .analysis .arch )
424- code_features .append (CodeFeature (sig , comment , bytez , filemd5 ))
440+ code_features .append (CodeFeature (sig , comment , bytez , filemd5 , addr , capa . rules . FUNCTION_SCOPE ))
425441
426442 if len (code_features ) == 0 :
427443 logger .warning ("No code features found for %s" , filemd5 )
@@ -479,7 +495,7 @@ def get_code_features_for_dotnet_doc(doc: rd.ResultDocument, extractor):
479495 func_comment , sig , bytez = get_sig_and_mask_for_dotnet_func (dnpe , f .inner )
480496 comment += func_comment
481497
482- code_features .append (CodeFeature (sig , comment , bytez , filemd5 ))
498+ code_features .append (CodeFeature (sig , comment , bytez , filemd5 , addr , capa . rules . FUNCTION_SCOPE ))
483499
484500 if len (code_features ) == 0 :
485501 logger .warning ("No code features found for %s" , filemd5 )
@@ -596,6 +612,7 @@ def multi_process_capa(argv=None):
596612 parser .add_argument ("input" , type = str , nargs = "+" , help = "Path to directory or files to analyze" )
597613 parser .add_argument ("-n" , "--parallelism" , type = int , default = multiprocessing .cpu_count (), help = "parallelism factor" )
598614 parser .add_argument ("--no-mp" , action = "store_true" , help = "disable subprocesses" )
615+ parser .add_argument ("--dump-features" , action = "store_true" , help = "output feature dictionary as json" )
599616 args = parser .parse_args (args = argv )
600617 capa .main .handle_common_args (args )
601618
@@ -665,9 +682,22 @@ def map(f, args, parallelism=None):
665682
666683 logger .info ("Done processing %s samples" , len (samples ))
667684
685+ if args .dump_features :
686+ dump_file_features (results )
687+ sys .exit (0 )
688+
668689 return results
669690
670691
692+ # Output related functions
693+
694+
def dump_file_features(result_dict: dict):
    """Print the extracted code features as JSON.

    `result_dict` maps a file md5 to an iterable of feature objects; each
    feature is converted through its `json()` method and the resulting
    mapping is printed to stdout with a four-space indent.
    """
    serializable = {}
    for md5, feature_list in result_dict.items():
        serializable[md5] = [feature.json() for feature in feature_list]
    print(json.dumps(serializable, indent=4))
699+
700+
671701# YARA related functions
672702
673703CODE_FEATURES_REFERENCED : List [CodeFeature ] = []
0 commit comments