Skip to content

Commit caf1a12

Browse files
Merge pull request #28 from ubc-provenance/dev
fix dataset parsing issue
2 parents e5f2d09 + c0ceef0 commit caf1a12

File tree

1 file changed

+90
-52
lines changed

1 file changed

+90
-52
lines changed

dataset_preprocessing/darpa_tc/create_database_e3.py

Lines changed: 90 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pidsmaker.config import get_runtime_required_args, get_yml_cfg
88
from pidsmaker.utils.dataset_utils import edge_reversed, exclude_edge_type
99
from pidsmaker.utils.utils import init_database_connection, log
10+
import json
1011

1112
from . import filelist
1213

@@ -27,31 +28,40 @@ def store_netflow(file_path, cur, connect, index_id, filelist):
2728
for file in tqdm(filelist):
2829
with open(file_path + file, "r") as f:
2930
for line in f:
30-
if '{"datum":{"com.bbn.tc.schema.avro.cdm18.NetFlowObject"' in line:
31-
try:
32-
# res = re.findall(
33-
# 'NetFlowObject":{"uuid":"(.*?)"(.*?)"localAddress":{"string":"(.*?)"},"localPort":{"int":(.*?)},"remoteAddress":{"string":"(.*?)"},"remotePort":{"int":(.*?)}',
34-
# line)[0]
35-
res = re.findall(
36-
'NetFlowObject":{"uuid":"(.*?)"(.*?)"localAddress":"(.*?)","localPort":(.*?),"remoteAddress":"(.*?)","remotePort":(.*?),',
37-
line,
38-
)[0]
39-
40-
nodeid = res[0]
41-
srcaddr = res[2]
42-
srcport = res[3]
43-
dstaddr = res[4]
44-
dstport = res[5]
45-
46-
nodeproperty = srcaddr + "," + srcport + "," + dstaddr + "," + dstport
47-
hashstr = stringtomd5(nodeid)
48-
netobj2hash[nodeid] = [hashstr, nodeproperty]
49-
netobj2hash[hashstr] = nodeid
50-
netobjset.add(hashstr)
51-
successful_num += 1
52-
except:
53-
failed_num += 1
54-
pass
31+
if '{"datum":{"com.bbn.tc.schema.avro.cdm18.NetFlowObject"' not in line:
32+
continue
33+
34+
try:
35+
obj = json.loads(line)
36+
netobj = obj["datum"]["com.bbn.tc.schema.avro.cdm18.NetFlowObject"]
37+
38+
nodeid = netobj["uuid"]
39+
40+
srcaddr = netobj.get("localAddress", "null")
41+
srcport = netobj.get("localPort", "null")
42+
dstaddr = netobj.get("remoteAddress", "null")
43+
dstport = netobj.get("remotePort", "null")
44+
45+
if isinstance(srcaddr, dict):
46+
srcaddr = srcaddr.get("string", "null")
47+
if isinstance(dstaddr, dict):
48+
dstaddr = dstaddr.get("string", "null")
49+
if isinstance(srcport, dict):
50+
srcport = str(srcport.get("int", "null"))
51+
if isinstance(dstport, dict):
52+
dstport = str(dstport.get("int", "null"))
53+
54+
nodeproperty = f"{str(srcaddr)},{str(srcport)},{str(dstaddr)},{str(dstport)}"
55+
hashstr = stringtomd5(nodeid)
56+
57+
netobj2hash[nodeid] = [hashstr, nodeproperty]
58+
netobj2hash[hashstr] = nodeid
59+
netobjset.add(hashstr)
60+
61+
successful_num += 1
62+
63+
except Exception as e:
64+
failed_num += 1
5565

5666
# Store data into database
5767
datalist = []
@@ -81,24 +91,35 @@ def store_subject(file_path, cur, connect, index_id, filelist):
8191
for file in tqdm(filelist):
8292
with open(file_path + file, "r") as f:
8393
for line in f:
84-
if '{"datum":{"com.bbn.tc.schema.avro.cdm18.Subject"' in line:
85-
subject_uuid = re.findall(
86-
'Subject":{"uuid":"(.*?)"(.*?)"cmdLine":{"string":"(.*?)"}(.*?)"path":"(.*?)"',
87-
line,
88-
)
89-
90-
try:
91-
subject_obj2hash[subject_uuid[0][0]] = [
92-
subject_uuid[0][-1],
93-
subject_uuid[0][-3],
94-
] # {uuid:[path, cmd]}
95-
success_count += 1
96-
except:
97-
try:
98-
subject_obj2hash[subject_uuid[0][0]] = ["null", subject_uuid[0][-3]]
99-
except:
100-
pass
101-
fail_count += 1
94+
if '{"datum":{"com.bbn.tc.schema.avro.cdm18.Subject"' not in line:
95+
continue
96+
97+
try:
98+
obj = json.loads(line)
99+
subject = obj["datum"]["com.bbn.tc.schema.avro.cdm18.Subject"]
100+
101+
uuid = subject["uuid"]
102+
103+
cmd = "null"
104+
cmd_raw = subject.get("cmdLine")
105+
106+
if isinstance(cmd_raw, str):
107+
# in cadets_e3
108+
cmd = cmd_raw
109+
elif isinstance(cmd_raw, dict):
110+
# in theia_e3 / clearscope_e3
111+
cmd = cmd_raw.get("string", "null")
112+
113+
path = "null"
114+
props = subject.get("properties", {}).get("map", {})
115+
if "path" in props:
116+
path = props["path"]
117+
118+
subject_obj2hash[uuid] = [path, cmd]
119+
success_count += 1
120+
except Exception as e:
121+
fail_count += 1
122+
102123
# Store into database
103124
datalist = []
104125
subject_uuid2hash = {}
@@ -128,15 +149,32 @@ def store_file(file_path, cur, connect, index_id, filelist):
128149
for file in tqdm(filelist):
129150
with open(file_path + file, "r") as f:
130151
for line in f:
131-
if '{"datum":{"com.bbn.tc.schema.avro.cdm18.FileObject"' in line:
132-
Object_uuid = re.findall(
133-
'FileObject":{"uuid":"(.*?)",(.*?)"filename":"(.*?)"', line
134-
)
135-
try:
136-
file_obj2hash[Object_uuid[0][0]] = Object_uuid[0][-1]
137-
success_count += 1
138-
except:
139-
fail_count += 1
152+
if '{"datum":{"com.bbn.tc.schema.avro.cdm18.FileObject"' not in line:
153+
continue
154+
155+
try:
156+
obj = json.loads(line)
157+
fileobj = obj["datum"]["com.bbn.tc.schema.avro.cdm18.FileObject"]
158+
uuid = fileobj["uuid"]
159+
160+
filename = "null"
161+
base = fileobj.get("baseObject", {})
162+
props = base.get("properties", {}).get("map", {})
163+
164+
if "filename" in base:
165+
filename = base["filename"]
166+
elif "path" in base:
167+
filename = base["path"]
168+
169+
if "filename" in props:
170+
filename = props["filename"]
171+
elif "path" in props:
172+
filename = props["path"]
173+
174+
file_obj2hash[uuid] = filename
175+
176+
except Exception as e:
177+
fail_count += 1
140178

141179
datalist = []
142180
file_uuid2hash = {}

0 commit comments

Comments
 (0)