66class ScanXml (strelka .Scanner ):
77 """Collects metadata and extracts embedded files from XML files.
88
9+ Extracts JavaScript content from script tags and emits each script as a child file.
10+
911 Options:
1012 extract_tags: List of XML tags that will have their text extracted
1113 as child files.
@@ -23,7 +25,7 @@ def scan(self, data, file, options, expire_at):
2325 self .event .setdefault ('tags' , [])
2426 self .event .setdefault ('tag_data' , [])
2527 self .event .setdefault ('namespaces' , [])
26- self .event ['total' ] = {'tags' : 0 , 'extracted' : 0 }
28+ self .event ['total' ] = {'tags' : 0 , 'scripts' : 0 , ' extracted' : 0 }
2729
2830 xml = None
2931 try :
@@ -89,6 +91,30 @@ def _recurse_node(self, node, xml_args):
8991 self .files .append (extract_file )
9092 self .event ['total' ]['extracted' ] += 1
9193
94+ # Check for script tags and extract JavaScript content
95+ if tag == 'script' :
96+ self .event ['total' ]['scripts' ] += 1
97+
98+ if text and text .strip ():
99+ extract_file = strelka .File (
100+ name = f'script_{ self .event ["total" ]["scripts" ]- 1 } ' ,
101+ source = self .name ,
102+ )
103+ script_flavors = [
104+ node .attrib .get ('type' , '' ).lower (),
105+ ]
106+ extract_file .add_flavors ({'external' : script_flavors })
107+
108+ for c in strelka .chunk_string (text ):
109+ self .upload_to_coordinator (
110+ extract_file .pointer ,
111+ c ,
112+ self .expire_at ,
113+ )
114+
115+ self .files .append (extract_file )
116+ self .event ['total' ]['extracted' ] += 1
117+
92118 for child in node .getchildren ():
93119 self ._recurse_node (self , child , xml_args )
94120
0 commit comments