Skip to content

Commit 2607374

Browse files
authored
Merge pull request #870 from microsoft/copilot/fix-ti-lookup-error
Fix pre-commit linting failures in TI provider. Require clean build to progress other PRs in queue.
2 parents 38e0cff + 4bca99b commit 2607374

File tree

89 files changed

+37857
-37272
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

89 files changed

+37857
-37272
lines changed

.ci_config/UserExclusion.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88

99
<!-- Each of these exclusions is a folder name - if any folder or file starts with "\[name]", it will be skipped -->
1010
<!--<Exclusion Type="FolderPathStart">ABC|XYZ</Exclusion>-->
11-
11+
1212
<!-- Each of these file types will be completely skipped for the entire scan -->
1313
<Exclusion Type="FileType">.CSV</Exclusion>
14-
14+
1515
<!-- The specified file names will be skipped during the scan regardless which folder they are in -->
1616
<Exclusion Type="FileName">TLD_SEED.TXT|QUERY_DATA.CSV|SIGNIN_CHARTS.YAML|GEOPIP.PY</Exclusion>
17-
17+
1818
</PoliCheckExclusions>

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,4 @@ repos:
4141
pass_filenames: False
4242
language: python
4343
types: [python]
44-
additional_dependencies: ['packaging>=24.0']
44+
additional_dependencies: ['packaging>=24.0', 'setuptools>=42']

CITATION.cff

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
abstract: |
22
"Microsoft Threat Intelligence Python Security Tools - MSTICPy
3-
3+
44
A library for InfoSec investigation and hunting in Jupyter Notebooks. It includes functionality to:
55
- query log data from multiple sources
66
- enrich the data with threat intelligence, geo-locations and Azure resource data
77
- extract Indicators of Activity (IoA) from logs and unpack encoded data
88
- analyze for anomalous sessions and events
99
- visualize data using interactive timelines, process trees and multi-dimensional Morph Charts"
10-
authors:
10+
authors:
1111
- given-names: Ian
1212
family-names: Hellen
1313
affiliation: "Microsoft Corp."
@@ -22,7 +22,7 @@ authors:
2222
alias: ashwinpatil
2323
cff-version: "1.2.0"
2424
date-released: 2021-04-14
25-
keywords:
25+
keywords:
2626
- CyberSecurity
2727
- Jupyter
2828
- InfoSec

SECURITY.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Instead, please report them to the Microsoft Security Response Center (MSRC) at
1414

1515
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
1616

17-
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
17+
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
1818

1919
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
2020

docs/generate_query_docs.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
# license information.
55
# --------------------------------------------------------------------------
66
"""Generate documentation of current queries."""
7+
78
import argparse
89
from pathlib import Path
910

1011
import pandas as pd
11-
from tabulate import tabulate # type: ignore
1212
import tqdm
13+
from tabulate import tabulate # type: ignore
1314

1415
from msticpy.data import QueryProvider
1516

@@ -70,9 +71,7 @@ def get_query_list():
7071
}
7172
query_series.append(pd.Series(q_dict))
7273
print()
73-
return pd.DataFrame(query_series).sort_values(
74-
["Environment", "QueryGroup", "Query"]
75-
)
74+
return pd.DataFrame(query_series).sort_values(["Environment", "QueryGroup", "Query"])
7675

7776

7877
def generate_document(query_df): # sourcery skip: identity-comprehension

docs/notebooks/AWS_S3_HoneybucketLogAnalysis.ipynb

Lines changed: 46 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -276,18 +276,18 @@
276276
}
277277
],
278278
"source": [
279-
"import json\n",
280-
"import re\n",
281279
"import pprint\n",
282-
"from IPython.display import display, HTML\n",
280+
"import re\n",
281+
"\n",
283282
"import matplotlib\n",
284-
"import squarify\n",
285283
"import matplotlib.pyplot as plt\n",
284+
"import squarify\n",
285+
"from IPython.display import HTML, display\n",
286286
"\n",
287287
"%matplotlib inline\n",
288288
"\n",
289-
"REQ_PYTHON_VER=(3, 6)\n",
290-
"REQ_MSTICPY_VER=(1, 4, 4)\n",
289+
"REQ_PYTHON_VER = (3, 6)\n",
290+
"REQ_MSTICPY_VER = (1, 4, 4)\n",
291291
"\n",
292292
"display(HTML(\"<h3>Starting Notebook setup...</h3>\"))\n",
293293
"\n",
@@ -369,11 +369,11 @@
369369
],
370370
"source": [
371371
"# Specify the input log filename\n",
372-
"logfile_name = './data/AWS_Honeybucket_Logs.txt'\n",
372+
"logfile_name = \"./data/AWS_Honeybucket_Logs.txt\"\n",
373+
"\n",
374+
"with open(logfile_name) as f:\n",
375+
" input_logs = f.read()\n",
373376
"\n",
374-
"with open(logfile_name, 'r') as f:\n",
375-
" input_logs= f.read()\n",
376-
" \n",
377377
"print(f\"Total no of lines in the log file: {len(input_logs)}\")\n",
378378
"\n",
379379
"# Display first 20 lines from a log file\n",
@@ -453,13 +453,13 @@
453453
],
454454
"source": [
455455
"def clean_logfile(logfile_name):\n",
456-
" \"Function to spllit each alert and find and replace to create dictionary like key-value pairs\"\n",
456+
" \"\"\"Function to spllit each alert and find and replace to create dictionary like key-value pairs\"\"\"\n",
457457
" print(\"Splitting individiual alerts...\")\n",
458458
" s3log_records = re.split(\"AWS Request Details\", input_logs)\n",
459459
" s3clean_logs = []\n",
460460
" print(\"Find and replace the data into clean unified format...\")\n",
461461
" # Excluding first and last event which are not access alerts\n",
462-
" for logs in s3log_records[1:-1]: \n",
462+
" for logs in s3log_records[1:-1]:\n",
463463
" logs = re.sub(\"Event Type\\n\", \"Event Type:\", logs)\n",
464464
" logs = re.sub(\"Event Name\\n\", \"Event Name:\", logs)\n",
465465
" logs = re.sub(\"Request ID\\n\", \"Request ID:\", logs)\n",
@@ -482,7 +482,7 @@
482482
"\n",
483483
"\n",
484484
"def create_dicts(clean_logfile):\n",
485-
" \"Function to create key value pairs and return list of json records\"\n",
485+
" \"\"\"Function to create key value pairs and return list of json records\"\"\"\n",
486486
" list_of_json_records = []\n",
487487
" print(\"\\nCreating dictionary pairs from clean dataset...\")\n",
488488
" for event in clean_logfile:\n",
@@ -497,6 +497,7 @@
497497
" list_of_json_records.append(parsed_dict)\n",
498498
" return list_of_json_records\n",
499499
"\n",
500+
"\n",
500501
"display(HTML(\"<h4>Cleaning log file and creating structured json file...</h4>\"))\n",
501502
"s3clean_logs = clean_logfile(logfile_name)\n",
502503
"list_of_json_records = create_dicts(s3clean_logs)\n",
@@ -689,7 +690,7 @@
689690
}
690691
],
691692
"source": [
692-
"#Load list of JSON records into dataframe\n",
693+
"# Load list of JSON records into dataframe\n",
693694
"df = pd.DataFrame(list_of_json_records)\n",
694695
"\n",
695696
"df.head()"
@@ -1245,7 +1246,7 @@
12451246
}
12461247
],
12471248
"source": [
1248-
"select_ti = browse_results(ti_resp, severities=['high'])\n",
1249+
"select_ti = browse_results(ti_resp, severities=[\"high\"])\n",
12491250
"select_ti"
12501251
]
12511252
},
@@ -1325,19 +1326,15 @@
13251326
"source": [
13261327
"# Changing data type to datetime and formatting datetime objects\n",
13271328
"datetime_format = \"%Y-%m-%d %H:%M:%S %Z\"\n",
1328-
"df_enriched[\"Event DateTime\"] = pd.to_datetime(\n",
1329-
" df_enriched[\"Event DateTime\"]\n",
1330-
")\n",
1331-
"df_enriched[\"Alarm DateTime\"] = pd.to_datetime(\n",
1332-
" df_enriched[\"Alarm DateTime\"]\n",
1333-
")\n",
1329+
"df_enriched[\"Event DateTime\"] = pd.to_datetime(df_enriched[\"Event DateTime\"])\n",
1330+
"df_enriched[\"Alarm DateTime\"] = pd.to_datetime(df_enriched[\"Alarm DateTime\"])\n",
13341331
"\n",
13351332
"\n",
13361333
"# Sorting values and selecting first\n",
13371334
"first_alert = df_enriched.sort_values(by=\"Alarm DateTime\").head(1)\n",
13381335
"first_alert[\"Alarm DateTime\"] = first_alert[\"Alarm DateTime\"].dt.strftime(datetime_format)\n",
13391336
"\n",
1340-
"#Filter columns to display\n",
1337+
"# Filter columns to display\n",
13411338
"display_columns = [\n",
13421339
" \"Alarm DateTime\",\n",
13431340
" \"Source IP\",\n",
@@ -1346,7 +1343,7 @@
13461343
" \"Request User Agent\",\n",
13471344
"]\n",
13481345
"\n",
1349-
"#Display Alert\n",
1346+
"# Display Alert\n",
13501347
"first_alert[display_columns]"
13511348
]
13521349
},
@@ -1545,9 +1542,7 @@
15451542
"\n",
15461543
"df_enriched[\"MonthofYear\"] = df_enriched[\"Event DateTime\"].dt.strftime(\"%Y-%m\")\n",
15471544
"\n",
1548-
"monthly_df = (\n",
1549-
" df_enriched.groupby([\"MonthofYear\"])[\"MonthofYear\"].agg({\"count\"}).reset_index()\n",
1550-
")\n",
1545+
"monthly_df = df_enriched.groupby([\"MonthofYear\"])[\"MonthofYear\"].agg({\"count\"}).reset_index()\n",
15511546
"\n",
15521547
"# Display data\n",
15531548
"monthly_df"
@@ -1610,8 +1605,8 @@
16101605
],
16111606
"source": [
16121607
"print(\n",
1613-
" f'''No of Unique Countries seen: {len(df_enriched['CountryName'].unique())} \n",
1614-
" No of unique ASN : {len(df_enriched['SourceASN'].unique())}'''\n",
1608+
" f\"\"\"No of Unique Countries seen: {len(df_enriched[\"CountryName\"].unique())} \n",
1609+
" No of unique ASN : {len(df_enriched[\"SourceASN\"].unique())}\"\"\"\n",
16151610
")"
16161611
]
16171612
},
@@ -1647,7 +1642,7 @@
16471642
}
16481643
],
16491644
"source": [
1650-
"df_enriched['Event Name'].value_counts()"
1645+
"df_enriched[\"Event Name\"].value_counts()"
16511646
]
16521647
},
16531648
{
@@ -1765,7 +1760,7 @@
17651760
}
17661761
],
17671762
"source": [
1768-
"df_enriched['User ID'].value_counts().reset_index()"
1763+
"df_enriched[\"User ID\"].value_counts().reset_index()"
17691764
]
17701765
},
17711766
{
@@ -1924,9 +1919,9 @@
19241919
}
19251920
],
19261921
"source": [
1927-
"pd.set_option('max_colwidth', 200)\n",
1922+
"pd.set_option(\"max_colwidth\", 200)\n",
19281923
"\n",
1929-
"df_enriched.groupby(['Event Name','Request Parameters'])['Alert'].agg({'count'})"
1924+
"df_enriched.groupby([\"Event Name\", \"Request Parameters\"])[\"Alert\"].agg({\"count\"})"
19301925
]
19311926
},
19321927
{
@@ -1963,7 +1958,7 @@
19631958
}
19641959
],
19651960
"source": [
1966-
"df_enriched['Request User Agent'].value_counts().tail(5)"
1961+
"df_enriched[\"Request User Agent\"].value_counts().tail(5)"
19671962
]
19681963
},
19691964
{
@@ -2000,7 +1995,7 @@
20001995
}
20011996
],
20021997
"source": [
2003-
"df_enriched['CountryName'].value_counts().head(5)"
1998+
"df_enriched[\"CountryName\"].value_counts().head(5)"
20041999
]
20052000
},
20062001
{
@@ -2200,9 +2195,9 @@
22002195
],
22012196
"source": [
22022197
"# Repeat IP Addresses\n",
2203-
"df_grouped = df_enriched.groupby(['Source IP'])['Source IP'].agg({'count'})\n",
2198+
"df_grouped = df_enriched.groupby([\"Source IP\"])[\"Source IP\"].agg({\"count\"})\n",
22042199
"\n",
2205-
"df_grouped[df_grouped['count'] > 1].sort_values(by='count', ascending=False)"
2200+
"df_grouped[df_grouped[\"count\"] > 1].sort_values(by=\"count\", ascending=False)"
22062201
]
22072202
},
22082203
{
@@ -2360,9 +2355,9 @@
23602355
],
23612356
"source": [
23622357
"# Repeat IP Addresses\n",
2363-
"df_asngrouped = df_enriched.groupby(['SourceASN'])['Source IP'].agg({'count'})\n",
2358+
"df_asngrouped = df_enriched.groupby([\"SourceASN\"])[\"Source IP\"].agg({\"count\"})\n",
23642359
"\n",
2365-
"df_asngrouped[df_asngrouped['count'] > 1].sort_values(by='count', ascending=False)"
2360+
"df_asngrouped[df_asngrouped[\"count\"] > 1].sort_values(by=\"count\", ascending=False)"
23662361
]
23672362
},
23682363
{
@@ -2461,7 +2456,7 @@
24612456
}
24622457
],
24632458
"source": [
2464-
"ti_resp.groupby(['Severity', 'Provider'])['Ioc'].agg({'count'})"
2459+
"ti_resp.groupby([\"Severity\", \"Provider\"])[\"Ioc\"].agg({\"count\"})"
24652460
]
24662461
},
24672462
{
@@ -2505,14 +2500,15 @@
25052500
}
25062501
],
25072502
"source": [
2508-
"plt.rcParams[\"figure.figsize\"] = (12,6)\n",
2509-
"plt.style.use('seaborn-darkgrid')\n",
2503+
"plt.rcParams[\"figure.figsize\"] = (12, 6)\n",
2504+
"plt.style.use(\"seaborn-darkgrid\")\n",
25102505
"\n",
25112506
"fig, ax = plt.subplots()\n",
25122507
"ax.plot(monthly_df[\"MonthofYear\"], monthly_df[\"count\"])\n",
25132508
"\n",
2514-
"ax.set(xlabel='Per Month', ylabel='Count per each month',\n",
2515-
" title='Monthly distribution of alerts')\n",
2509+
"ax.set(\n",
2510+
" xlabel=\"Per Month\", ylabel=\"Count per each month\", title=\"Monthly distribution of alerts\"\n",
2511+
")\n",
25162512
"plt.xticks(rotation=60)\n",
25172513
"\n",
25182514
"plt.show()"
@@ -2550,9 +2546,7 @@
25502546
],
25512547
"source": [
25522548
"# Create dataset with count per Country\n",
2553-
"country_df = (\n",
2554-
" df_enriched.groupby([\"CountryName\"])[\"CountryName\"].agg({\"count\"}).reset_index()\n",
2555-
")\n",
2549+
"country_df = df_enriched.groupby([\"CountryName\"])[\"CountryName\"].agg({\"count\"}).reset_index()\n",
25562550
"\n",
25572551
"# normalize the count range to populate color pallette\n",
25582552
"norm = matplotlib.colors.Normalize(\n",
@@ -2617,9 +2611,7 @@
26172611
],
26182612
"source": [
26192613
"# Repeat IP Addresses\n",
2620-
"df_asngrouped = df_enriched.groupby([\"SourceASN\", \"CountryName\"])[\"Alert\"].agg(\n",
2621-
" {\"count\"}\n",
2622-
")\n",
2614+
"df_asngrouped = df_enriched.groupby([\"SourceASN\", \"CountryName\"])[\"Alert\"].agg({\"count\"})\n",
26232615
"\n",
26242616
"# Filter records with count less than 4\n",
26252617
"df2 = (\n",
@@ -2679,22 +2671,24 @@
26792671
"\n",
26802672
"folium_map = FoliumMap()\n",
26812673
"\n",
2674+
"\n",
26822675
"def format_ip_entity(row, ip_col):\n",
26832676
" ip_entity = entities.IpAddress(Address=row[ip_col])\n",
26842677
" iplocation.lookup_ip(ip_entity=ip_entity)\n",
26852678
" if \"severity\" in row:\n",
26862679
" ip_entity.AdditionalData[\"threat severity\"] = row[\"severity\"]\n",
26872680
" return ip_entity\n",
26882681
"\n",
2682+
"\n",
26892683
"# Filtering high and warning Ips to display on Geomap\n",
26902684
"ti_resp_threats = ti_resp[ti_resp.Severity.isin([\"high\", \"warning\"])]\n",
26912685
"\n",
26922686
"ips_threats = list(ti_resp_threats.apply(lambda x: format_ip_entity(x, \"Ioc\"), axis=1))\n",
26932687
"\n",
26942688
"# Convert our IP addresses in string format into an ip address entity\n",
26952689
"ip_entity = entities.IpAddress()\n",
2696-
"ip_list = [convert_to_ip_entities(i)[0] for i in ti_resp_threats['Ioc']]\n",
2697-
" \n",
2690+
"ip_list = [convert_to_ip_entities(i)[0] for i in ti_resp_threats[\"Ioc\"]]\n",
2691+
"\n",
26982692
"# Get center location of all IP locaitons to center the map on\n",
26992693
"location = get_map_center(ip_list)\n",
27002694
"s3bucket_map = FoliumMap(location=location, zoom_start=2)\n",
@@ -2703,7 +2697,7 @@
27032697
"if len(ip_list) > 0:\n",
27042698
" icon_props = {\"color\": \"red\"}\n",
27052699
" s3bucket_map.add_ip_cluster(ip_entities=ips_threats, **icon_props)\n",
2706-
" \n",
2700+
"\n",
27072701
"display(s3bucket_map.folium_map)"
27082702
]
27092703
},

0 commit comments

Comments
 (0)