Skip to content

Commit 2607374

Browse files
authored
Merge pull request #870 from microsoft/copilot/fix-ti-lookup-error
Fix pre-commit linting failures in TI provider. Require clean build to progress other PRs in queue.
2 parents 38e0cff + 4bca99b commit 2607374

File tree

89 files changed

+37857
-37272
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

89 files changed

+37857
-37272
lines changed

.ci_config/UserExclusion.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88

99
<!-- Each of these exclusions is a folder name - if any folder or file starts with "\[name]", it will be skipped -->
1010
<!--<Exclusion Type="FolderPathStart">ABC|XYZ</Exclusion>-->
11-
11+
1212
<!-- Each of these file types will be completely skipped for the entire scan -->
1313
<Exclusion Type="FileType">.CSV</Exclusion>
14-
14+
1515
<!-- The specified file names will be skipped during the scan regardless which folder they are in -->
1616
<Exclusion Type="FileName">TLD_SEED.TXT|QUERY_DATA.CSV|SIGNIN_CHARTS.YAML|GEOPIP.PY</Exclusion>
17-
17+
1818
</PoliCheckExclusions>

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,4 @@ repos:
4141
pass_filenames: False
4242
language: python
4343
types: [python]
44-
additional_dependencies: ['packaging>=24.0']
44+
additional_dependencies: ['packaging>=24.0', 'setuptools>=42']

CITATION.cff

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
abstract: |
22
"Microsoft Threat Intelligence Python Security Tools - MSTICPy
3-
3+
44
A library for InfoSec investigation and hunting in Jupyter Notebooks. It includes functionality to:
55
- query log data from multiple sources
66
- enrich the data with threat intelligence, geo-locations and Azure resource data
77
- extract Indicators of Activity (IoA) from logs and unpack encoded data
88
- analyze for anomalous sessions and events
99
- visualize data using interactive timelines, process trees and multi-dimensional Morph Charts"
10-
authors:
10+
authors:
1111
- given-names: Ian
1212
family-names: Hellen
1313
affiliation: "Microsoft Corp."
@@ -22,7 +22,7 @@ authors:
2222
alias: ashwinpatil
2323
cff-version: "1.2.0"
2424
date-released: 2021-04-14
25-
keywords:
25+
keywords:
2626
- CyberSecurity
2727
- Jupyter
2828
- InfoSec

SECURITY.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Instead, please report them to the Microsoft Security Response Center (MSRC) at
1414

1515
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
1616

17-
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
17+
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
1818

1919
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
2020

docs/generate_query_docs.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
# license information.
55
# --------------------------------------------------------------------------
66
"""Generate documentation of current queries."""
7+
78
import argparse
89
from pathlib import Path
910

1011
import pandas as pd
11-
from tabulate import tabulate # type: ignore
1212
import tqdm
13+
from tabulate import tabulate # type: ignore
1314

1415
from msticpy.data import QueryProvider
1516

@@ -70,9 +71,7 @@ def get_query_list():
7071
}
7172
query_series.append(pd.Series(q_dict))
7273
print()
73-
return pd.DataFrame(query_series).sort_values(
74-
["Environment", "QueryGroup", "Query"]
75-
)
74+
return pd.DataFrame(query_series).sort_values(["Environment", "QueryGroup", "Query"])
7675

7776

7877
def generate_document(query_df): # sourcery skip: identity-comprehension

docs/notebooks/AWS_S3_HoneybucketLogAnalysis.ipynb

Lines changed: 46 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -276,18 +276,18 @@
276276
}
277277
],
278278
"source": [
279-
"import json\n",
280-
"import re\n",
281279
"import pprint\n",
282-
"from IPython.display import display, HTML\n",
280+
"import re\n",
281+
"\n",
283282
"import matplotlib\n",
284-
"import squarify\n",
285283
"import matplotlib.pyplot as plt\n",
284+
"import squarify\n",
285+
"from IPython.display import HTML, display\n",
286286
"\n",
287287
"%matplotlib inline\n",
288288
"\n",
289-
"REQ_PYTHON_VER=(3, 6)\n",
290-
"REQ_MSTICPY_VER=(1, 4, 4)\n",
289+
"REQ_PYTHON_VER = (3, 6)\n",
290+
"REQ_MSTICPY_VER = (1, 4, 4)\n",
291291
"\n",
292292
"display(HTML(\"<h3>Starting Notebook setup...</h3>\"))\n",
293293
"\n",
@@ -369,11 +369,11 @@
369369
],
370370
"source": [
371371
"# Specify the input log filename\n",
372-
"logfile_name = './data/AWS_Honeybucket_Logs.txt'\n",
372+
"logfile_name = \"./data/AWS_Honeybucket_Logs.txt\"\n",
373+
"\n",
374+
"with open(logfile_name) as f:\n",
375+
" input_logs = f.read()\n",
373376
"\n",
374-
"with open(logfile_name, 'r') as f:\n",
375-
" input_logs= f.read()\n",
376-
" \n",
377377
"print(f\"Total no of lines in the log file: {len(input_logs)}\")\n",
378378
"\n",
379379
"# Display first 20 lines from a log file\n",
@@ -453,13 +453,13 @@
453453
],
454454
"source": [
455455
"def clean_logfile(logfile_name):\n",
456-
" \"Function to spllit each alert and find and replace to create dictionary like key-value pairs\"\n",
456+
" \"\"\"Function to spllit each alert and find and replace to create dictionary like key-value pairs\"\"\"\n",
457457
" print(\"Splitting individiual alerts...\")\n",
458458
" s3log_records = re.split(\"AWS Request Details\", input_logs)\n",
459459
" s3clean_logs = []\n",
460460
" print(\"Find and replace the data into clean unified format...\")\n",
461461
" # Excluding first and last event which are not access alerts\n",
462-
" for logs in s3log_records[1:-1]: \n",
462+
" for logs in s3log_records[1:-1]:\n",
463463
" logs = re.sub(\"Event Type\\n\", \"Event Type:\", logs)\n",
464464
" logs = re.sub(\"Event Name\\n\", \"Event Name:\", logs)\n",
465465
" logs = re.sub(\"Request ID\\n\", \"Request ID:\", logs)\n",
@@ -482,7 +482,7 @@
482482
"\n",
483483
"\n",
484484
"def create_dicts(clean_logfile):\n",
485-
" \"Function to create key value pairs and return list of json records\"\n",
485+
" \"\"\"Function to create key value pairs and return list of json records\"\"\"\n",
486486
" list_of_json_records = []\n",
487487
" print(\"\\nCreating dictionary pairs from clean dataset...\")\n",
488488
" for event in clean_logfile:\n",
@@ -497,6 +497,7 @@
497497
" list_of_json_records.append(parsed_dict)\n",
498498
" return list_of_json_records\n",
499499
"\n",
500+
"\n",
500501
"display(HTML(\"<h4>Cleaning log file and creating structured json file...</h4>\"))\n",
501502
"s3clean_logs = clean_logfile(logfile_name)\n",
502503
"list_of_json_records = create_dicts(s3clean_logs)\n",
@@ -689,7 +690,7 @@
689690
}
690691
],
691692
"source": [
692-
"#Load list of JSON records into dataframe\n",
693+
"# Load list of JSON records into dataframe\n",
693694
"df = pd.DataFrame(list_of_json_records)\n",
694695
"\n",
695696
"df.head()"
@@ -1245,7 +1246,7 @@
12451246
}
12461247
],
12471248
"source": [
1248-
"select_ti = browse_results(ti_resp, severities=['high'])\n",
1249+
"select_ti = browse_results(ti_resp, severities=[\"high\"])\n",
12491250
"select_ti"
12501251
]
12511252
},
@@ -1325,19 +1326,15 @@
13251326
"source": [
13261327
"# Changing data type to datetime and formatting datetime objects\n",
13271328
"datetime_format = \"%Y-%m-%d %H:%M:%S %Z\"\n",
1328-
"df_enriched[\"Event DateTime\"] = pd.to_datetime(\n",
1329-
" df_enriched[\"Event DateTime\"]\n",
1330-
")\n",
1331-
"df_enriched[\"Alarm DateTime\"] = pd.to_datetime(\n",
1332-
" df_enriched[\"Alarm DateTime\"]\n",
1333-
")\n",
1329+
"df_enriched[\"Event DateTime\"] = pd.to_datetime(df_enriched[\"Event DateTime\"])\n",
1330+
"df_enriched[\"Alarm DateTime\"] = pd.to_datetime(df_enriched[\"Alarm DateTime\"])\n",
13341331
"\n",
13351332
"\n",
13361333
"# Sorting values and selecting first\n",
13371334
"first_alert = df_enriched.sort_values(by=\"Alarm DateTime\").head(1)\n",
13381335
"first_alert[\"Alarm DateTime\"] = first_alert[\"Alarm DateTime\"].dt.strftime(datetime_format)\n",
13391336
"\n",
1340-
"#Filter columns to display\n",
1337+
"# Filter columns to display\n",
13411338
"display_columns = [\n",
13421339
" \"Alarm DateTime\",\n",
13431340
" \"Source IP\",\n",
@@ -1346,7 +1343,7 @@
13461343
" \"Request User Agent\",\n",
13471344
"]\n",
13481345
"\n",
1349-
"#Display Alert\n",
1346+
"# Display Alert\n",
13501347
"first_alert[display_columns]"
13511348
]
13521349
},
@@ -1545,9 +1542,7 @@
15451542
"\n",
15461543
"df_enriched[\"MonthofYear\"] = df_enriched[\"Event DateTime\"].dt.strftime(\"%Y-%m\")\n",
15471544
"\n",
1548-
"monthly_df = (\n",
1549-
" df_enriched.groupby([\"MonthofYear\"])[\"MonthofYear\"].agg({\"count\"}).reset_index()\n",
1550-
")\n",
1545+
"monthly_df = df_enriched.groupby([\"MonthofYear\"])[\"MonthofYear\"].agg({\"count\"}).reset_index()\n",
15511546
"\n",
15521547
"# Display data\n",
15531548
"monthly_df"
@@ -1610,8 +1605,8 @@
16101605
],
16111606
"source": [
16121607
"print(\n",
1613-
" f'''No of Unique Countries seen: {len(df_enriched['CountryName'].unique())} \n",
1614-
" No of unique ASN : {len(df_enriched['SourceASN'].unique())}'''\n",
1608+
" f\"\"\"No of Unique Countries seen: {len(df_enriched[\"CountryName\"].unique())} \n",
1609+
" No of unique ASN : {len(df_enriched[\"SourceASN\"].unique())}\"\"\"\n",
16151610
")"
16161611
]
16171612
},
@@ -1647,7 +1642,7 @@
16471642
}
16481643
],
16491644
"source": [
1650-
"df_enriched['Event Name'].value_counts()"
1645+
"df_enriched[\"Event Name\"].value_counts()"
16511646
]
16521647
},
16531648
{
@@ -1765,7 +1760,7 @@
17651760
}
17661761
],
17671762
"source": [
1768-
"df_enriched['User ID'].value_counts().reset_index()"
1763+
"df_enriched[\"User ID\"].value_counts().reset_index()"
17691764
]
17701765
},
17711766
{
@@ -1924,9 +1919,9 @@
19241919
}
19251920
],
19261921
"source": [
1927-
"pd.set_option('max_colwidth', 200)\n",
1922+
"pd.set_option(\"max_colwidth\", 200)\n",
19281923
"\n",
1929-
"df_enriched.groupby(['Event Name','Request Parameters'])['Alert'].agg({'count'})"
1924+
"df_enriched.groupby([\"Event Name\", \"Request Parameters\"])[\"Alert\"].agg({\"count\"})"
19301925
]
19311926
},
19321927
{
@@ -1963,7 +1958,7 @@
19631958
}
19641959
],
19651960
"source": [
1966-
"df_enriched['Request User Agent'].value_counts().tail(5)"
1961+
"df_enriched[\"Request User Agent\"].value_counts().tail(5)"
19671962
]
19681963
},
19691964
{
@@ -2000,7 +1995,7 @@
20001995
}
20011996
],
20021997
"source": [
2003-
"df_enriched['CountryName'].value_counts().head(5)"
1998+
"df_enriched[\"CountryName\"].value_counts().head(5)"
20041999
]
20052000
},
20062001
{
@@ -2200,9 +2195,9 @@
22002195
],
22012196
"source": [
22022197
"# Repeat IP Addresses\n",
2203-
"df_grouped = df_enriched.groupby(['Source IP'])['Source IP'].agg({'count'})\n",
2198+
"df_grouped = df_enriched.groupby([\"Source IP\"])[\"Source IP\"].agg({\"count\"})\n",
22042199
"\n",
2205-
"df_grouped[df_grouped['count'] > 1].sort_values(by='count', ascending=False)"
2200+
"df_grouped[df_grouped[\"count\"] > 1].sort_values(by=\"count\", ascending=False)"
22062201
]
22072202
},
22082203
{
@@ -2360,9 +2355,9 @@
23602355
],
23612356
"source": [
23622357
"# Repeat IP Addresses\n",
2363-
"df_asngrouped = df_enriched.groupby(['SourceASN'])['Source IP'].agg({'count'})\n",
2358+
"df_asngrouped = df_enriched.groupby([\"SourceASN\"])[\"Source IP\"].agg({\"count\"})\n",
23642359
"\n",
2365-
"df_asngrouped[df_asngrouped['count'] > 1].sort_values(by='count', ascending=False)"
2360+
"df_asngrouped[df_asngrouped[\"count\"] > 1].sort_values(by=\"count\", ascending=False)"
23662361
]
23672362
},
23682363
{
@@ -2461,7 +2456,7 @@
24612456
}
24622457
],
24632458
"source": [
2464-
"ti_resp.groupby(['Severity', 'Provider'])['Ioc'].agg({'count'})"
2459+
"ti_resp.groupby([\"Severity\", \"Provider\"])[\"Ioc\"].agg({\"count\"})"
24652460
]
24662461
},
24672462
{
@@ -2505,14 +2500,15 @@
25052500
}
25062501
],
25072502
"source": [
2508-
"plt.rcParams[\"figure.figsize\"] = (12,6)\n",
2509-
"plt.style.use('seaborn-darkgrid')\n",
2503+
"plt.rcParams[\"figure.figsize\"] = (12, 6)\n",
2504+
"plt.style.use(\"seaborn-darkgrid\")\n",
25102505
"\n",
25112506
"fig, ax = plt.subplots()\n",
25122507
"ax.plot(monthly_df[\"MonthofYear\"], monthly_df[\"count\"])\n",
25132508
"\n",
2514-
"ax.set(xlabel='Per Month', ylabel='Count per each month',\n",
2515-
" title='Monthly distribution of alerts')\n",
2509+
"ax.set(\n",
2510+
" xlabel=\"Per Month\", ylabel=\"Count per each month\", title=\"Monthly distribution of alerts\"\n",
2511+
")\n",
25162512
"plt.xticks(rotation=60)\n",
25172513
"\n",
25182514
"plt.show()"
@@ -2550,9 +2546,7 @@
25502546
],
25512547
"source": [
25522548
"# Create dataset with count per Country\n",
2553-
"country_df = (\n",
2554-
" df_enriched.groupby([\"CountryName\"])[\"CountryName\"].agg({\"count\"}).reset_index()\n",
2555-
")\n",
2549+
"country_df = df_enriched.groupby([\"CountryName\"])[\"CountryName\"].agg({\"count\"}).reset_index()\n",
25562550
"\n",
25572551
"# normalize the count range to populate color pallette\n",
25582552
"norm = matplotlib.colors.Normalize(\n",
@@ -2617,9 +2611,7 @@
26172611
],
26182612
"source": [
26192613
"# Repeat IP Addresses\n",
2620-
"df_asngrouped = df_enriched.groupby([\"SourceASN\", \"CountryName\"])[\"Alert\"].agg(\n",
2621-
" {\"count\"}\n",
2622-
")\n",
2614+
"df_asngrouped = df_enriched.groupby([\"SourceASN\", \"CountryName\"])[\"Alert\"].agg({\"count\"})\n",
26232615
"\n",
26242616
"# Filter records with count less than 4\n",
26252617
"df2 = (\n",
@@ -2679,22 +2671,24 @@
26792671
"\n",
26802672
"folium_map = FoliumMap()\n",
26812673
"\n",
2674+
"\n",
26822675
"def format_ip_entity(row, ip_col):\n",
26832676
" ip_entity = entities.IpAddress(Address=row[ip_col])\n",
26842677
" iplocation.lookup_ip(ip_entity=ip_entity)\n",
26852678
" if \"severity\" in row:\n",
26862679
" ip_entity.AdditionalData[\"threat severity\"] = row[\"severity\"]\n",
26872680
" return ip_entity\n",
26882681
"\n",
2682+
"\n",
26892683
"# Filtering high and warning Ips to display on Geomap\n",
26902684
"ti_resp_threats = ti_resp[ti_resp.Severity.isin([\"high\", \"warning\"])]\n",
26912685
"\n",
26922686
"ips_threats = list(ti_resp_threats.apply(lambda x: format_ip_entity(x, \"Ioc\"), axis=1))\n",
26932687
"\n",
26942688
"# Convert our IP addresses in string format into an ip address entity\n",
26952689
"ip_entity = entities.IpAddress()\n",
2696-
"ip_list = [convert_to_ip_entities(i)[0] for i in ti_resp_threats['Ioc']]\n",
2697-
" \n",
2690+
"ip_list = [convert_to_ip_entities(i)[0] for i in ti_resp_threats[\"Ioc\"]]\n",
2691+
"\n",
26982692
"# Get center location of all IP locaitons to center the map on\n",
26992693
"location = get_map_center(ip_list)\n",
27002694
"s3bucket_map = FoliumMap(location=location, zoom_start=2)\n",
@@ -2703,7 +2697,7 @@
27032697
"if len(ip_list) > 0:\n",
27042698
" icon_props = {\"color\": \"red\"}\n",
27052699
" s3bucket_map.add_ip_cluster(ip_entities=ips_threats, **icon_props)\n",
2706-
" \n",
2700+
"\n",
27072701
"display(s3bucket_map.folium_map)"
27082702
]
27092703
},

0 commit comments

Comments
 (0)