Skip to content

Commit cee9c46

Browse files
authored
Merge pull request #30 from ubc-provenance/dev
Add Docstring + ruff formatting
2 parents: c2683d9 + c341f88 | commit: cee9c46

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

44 files changed

+1152
-417
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -185,3 +185,5 @@ postgres_data
185185

186186
# tokenizer data
187187
nltk_data
188+
189+
.claude

Dockerfile

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,15 @@ ENV PATH=$JAVA_HOME/bin:$PATH
1818
# installing sudo
1919
RUN apt-get update && apt-get install -y sudo git
2020

21+
# installing node for Claude Code
22+
RUN apt-get update \
23+
&& apt-get install -y ca-certificates curl \
24+
&& echo "deb [trusted=yes] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list \
25+
&& apt-get update \
26+
&& apt-get install -y nodejs \
27+
&& node -v && npm -v \
28+
&& rm -rf /var/lib/apt/lists/*
29+
2130
# installing Anaconda version 23.3.1
2231
RUN wget https://repo.anaconda.com/archive/Anaconda3-2023.03-1-Linux-x86_64.sh
2332
RUN bash Anaconda3-2023.03-1-Linux-x86_64.sh -b -p /opt/conda

dataset_preprocessing/darpa_tc/create_database_e3.py

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,12 @@
1+
"""DARPA TC E3 dataset preprocessing - create PostgreSQL database from JSON logs.
2+
3+
Parses DARPA TC Engagement 3 provenance JSON files and populates PostgreSQL
4+
database with nodes and events for graph construction. Handles Trace, Theia,
5+
ClearScope, and Cadets datasets.
6+
"""
7+
18
import hashlib
9+
import json
210
import re
311

412
from psycopg2 import extras as ex
@@ -7,7 +15,6 @@
715
from pidsmaker.config import get_runtime_required_args, get_yml_cfg
816
from pidsmaker.utils.dataset_utils import edge_reversed, exclude_edge_type
917
from pidsmaker.utils.utils import init_database_connection, log
10-
import json
1118

1219
from . import filelist
1320

@@ -93,7 +100,7 @@ def store_subject(file_path, cur, connect, index_id, filelist):
93100
for line in f:
94101
if '{"datum":{"com.bbn.tc.schema.avro.cdm18.Subject"' not in line:
95102
continue
96-
103+
97104
try:
98105
obj = json.loads(line)
99106
subject = obj["datum"]["com.bbn.tc.schema.avro.cdm18.Subject"]
@@ -151,7 +158,7 @@ def store_file(file_path, cur, connect, index_id, filelist):
151158
for line in f:
152159
if '{"datum":{"com.bbn.tc.schema.avro.cdm18.FileObject"' not in line:
153160
continue
154-
161+
155162
try:
156163
obj = json.loads(line)
157164
fileobj = obj["datum"]["com.bbn.tc.schema.avro.cdm18.FileObject"]
@@ -161,18 +168,18 @@ def store_file(file_path, cur, connect, index_id, filelist):
161168
base = fileobj.get("baseObject", {})
162169
props = base.get("properties", {}).get("map", {})
163170

164-
if "filename" in base:
171+
if "filename" in base:
165172
filename = base["filename"]
166-
elif "path" in base:
173+
elif "path" in base:
167174
filename = base["path"]
168175

169-
if "filename" in props:
176+
if "filename" in props:
170177
filename = props["filename"]
171-
elif "path" in props:
178+
elif "path" in props:
172179
filename = props["path"]
173180

174181
file_obj2hash[uuid] = filename
175-
182+
176183
except Exception as e:
177184
fail_count += 1
178185

dataset_preprocessing/darpa_tc/create_database_e5.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,10 @@
1+
"""DARPA TC E5 dataset preprocessing - create PostgreSQL database from JSON logs.
2+
3+
Parses DARPA TC Engagement 5 provenance JSON files and populates PostgreSQL
4+
database with nodes and events for graph construction. Handles Trace, Theia,
5+
ClearScope, and Cadets datasets.
6+
"""
7+
18
import re
29

310
from psycopg2 import extras as ex

0 commit comments

Comments
 (0)