|
276 | 276 | } |
277 | 277 | ], |
278 | 278 | "source": [ |
279 | | - "import json\n", |
280 | | - "import re\n", |
281 | 279 | "import pprint\n", |
282 | | - "from IPython.display import display, HTML\n", |
| 280 | + "import re\n", |
| 281 | + "\n", |
283 | 282 | "import matplotlib\n", |
284 | | - "import squarify\n", |
285 | 283 | "import matplotlib.pyplot as plt\n", |
| 284 | + "import squarify\n", |
| 285 | + "from IPython.display import HTML, display\n", |
286 | 286 | "\n", |
287 | 287 | "%matplotlib inline\n", |
288 | 288 | "\n", |
289 | | - "REQ_PYTHON_VER=(3, 6)\n", |
290 | | - "REQ_MSTICPY_VER=(1, 4, 4)\n", |
| 289 | + "REQ_PYTHON_VER = (3, 6)\n", |
| 290 | + "REQ_MSTICPY_VER = (1, 4, 4)\n", |
291 | 291 | "\n", |
292 | 292 | "display(HTML(\"<h3>Starting Notebook setup...</h3>\"))\n", |
293 | 293 | "\n", |
|
369 | 369 | ], |
370 | 370 | "source": [ |
371 | 371 | "# Specify the input log filename\n", |
372 | | - "logfile_name = './data/AWS_Honeybucket_Logs.txt'\n", |
| 372 | + "logfile_name = \"./data/AWS_Honeybucket_Logs.txt\"\n", |
| 373 | + "\n", |
| 374 | + "with open(logfile_name) as f:\n", |
| 375 | + " input_logs = f.read()\n", |
373 | 376 | "\n", |
374 | | - "with open(logfile_name, 'r') as f:\n", |
375 | | - " input_logs= f.read()\n", |
376 | | - " \n", |
377 | 377 | "print(f\"Total no of lines in the log file: {len(input_logs)}\")\n", |
378 | 378 | "\n", |
379 | 379 | "# Display first 20 lines from a log file\n", |
|
453 | 453 | ], |
454 | 454 | "source": [ |
455 | 455 | "def clean_logfile(logfile_name):\n", |
456 | | - " \"Function to spllit each alert and find and replace to create dictionary like key-value pairs\"\n", |
| 456 | + "    \"\"\"Function to split each alert, then find and replace to create dictionary-like key-value pairs\"\"\"\n", |
457 | 457 | " print(\"Splitting individiual alerts...\")\n", |
458 | 458 | " s3log_records = re.split(\"AWS Request Details\", input_logs)\n", |
459 | 459 | " s3clean_logs = []\n", |
460 | 460 | " print(\"Find and replace the data into clean unified format...\")\n", |
461 | 461 | " # Excluding first and last event which are not access alerts\n", |
462 | | - " for logs in s3log_records[1:-1]: \n", |
| 462 | + " for logs in s3log_records[1:-1]:\n", |
463 | 463 | " logs = re.sub(\"Event Type\\n\", \"Event Type:\", logs)\n", |
464 | 464 | " logs = re.sub(\"Event Name\\n\", \"Event Name:\", logs)\n", |
465 | 465 | " logs = re.sub(\"Request ID\\n\", \"Request ID:\", logs)\n", |
|
482 | 482 | "\n", |
483 | 483 | "\n", |
484 | 484 | "def create_dicts(clean_logfile):\n", |
485 | | - " \"Function to create key value pairs and return list of json records\"\n", |
| 485 | + " \"\"\"Function to create key value pairs and return list of json records\"\"\"\n", |
486 | 486 | " list_of_json_records = []\n", |
487 | 487 | " print(\"\\nCreating dictionary pairs from clean dataset...\")\n", |
488 | 488 | " for event in clean_logfile:\n", |
|
497 | 497 | " list_of_json_records.append(parsed_dict)\n", |
498 | 498 | " return list_of_json_records\n", |
499 | 499 | "\n", |
| 500 | + "\n", |
500 | 501 | "display(HTML(\"<h4>Cleaning log file and creating structured json file...</h4>\"))\n", |
501 | 502 | "s3clean_logs = clean_logfile(logfile_name)\n", |
502 | 503 | "list_of_json_records = create_dicts(s3clean_logs)\n", |
|
689 | 690 | } |
690 | 691 | ], |
691 | 692 | "source": [ |
692 | | - "#Load list of JSON records into dataframe\n", |
| 693 | + "# Load list of JSON records into dataframe\n", |
693 | 694 | "df = pd.DataFrame(list_of_json_records)\n", |
694 | 695 | "\n", |
695 | 696 | "df.head()" |
|
1245 | 1246 | } |
1246 | 1247 | ], |
1247 | 1248 | "source": [ |
1248 | | - "select_ti = browse_results(ti_resp, severities=['high'])\n", |
| 1249 | + "select_ti = browse_results(ti_resp, severities=[\"high\"])\n", |
1249 | 1250 | "select_ti" |
1250 | 1251 | ] |
1251 | 1252 | }, |
|
1325 | 1326 | "source": [ |
1326 | 1327 | "# Changing data type to datetime and formatting datetime objects\n", |
1327 | 1328 | "datetime_format = \"%Y-%m-%d %H:%M:%S %Z\"\n", |
1328 | | - "df_enriched[\"Event DateTime\"] = pd.to_datetime(\n", |
1329 | | - " df_enriched[\"Event DateTime\"]\n", |
1330 | | - ")\n", |
1331 | | - "df_enriched[\"Alarm DateTime\"] = pd.to_datetime(\n", |
1332 | | - " df_enriched[\"Alarm DateTime\"]\n", |
1333 | | - ")\n", |
| 1329 | + "df_enriched[\"Event DateTime\"] = pd.to_datetime(df_enriched[\"Event DateTime\"])\n", |
| 1330 | + "df_enriched[\"Alarm DateTime\"] = pd.to_datetime(df_enriched[\"Alarm DateTime\"])\n", |
1334 | 1331 | "\n", |
1335 | 1332 | "\n", |
1336 | 1333 | "# Sorting values and selecting first\n", |
1337 | 1334 | "first_alert = df_enriched.sort_values(by=\"Alarm DateTime\").head(1)\n", |
1338 | 1335 | "first_alert[\"Alarm DateTime\"] = first_alert[\"Alarm DateTime\"].dt.strftime(datetime_format)\n", |
1339 | 1336 | "\n", |
1340 | | - "#Filter columns to display\n", |
| 1337 | + "# Filter columns to display\n", |
1341 | 1338 | "display_columns = [\n", |
1342 | 1339 | " \"Alarm DateTime\",\n", |
1343 | 1340 | " \"Source IP\",\n", |
|
1346 | 1343 | " \"Request User Agent\",\n", |
1347 | 1344 | "]\n", |
1348 | 1345 | "\n", |
1349 | | - "#Display Alert\n", |
| 1346 | + "# Display Alert\n", |
1350 | 1347 | "first_alert[display_columns]" |
1351 | 1348 | ] |
1352 | 1349 | }, |
|
1545 | 1542 | "\n", |
1546 | 1543 | "df_enriched[\"MonthofYear\"] = df_enriched[\"Event DateTime\"].dt.strftime(\"%Y-%m\")\n", |
1547 | 1544 | "\n", |
1548 | | - "monthly_df = (\n", |
1549 | | - " df_enriched.groupby([\"MonthofYear\"])[\"MonthofYear\"].agg({\"count\"}).reset_index()\n", |
1550 | | - ")\n", |
| 1545 | + "monthly_df = df_enriched.groupby([\"MonthofYear\"])[\"MonthofYear\"].agg({\"count\"}).reset_index()\n", |
1551 | 1546 | "\n", |
1552 | 1547 | "# Display data\n", |
1553 | 1548 | "monthly_df" |
|
1610 | 1605 | ], |
1611 | 1606 | "source": [ |
1612 | 1607 | "print(\n", |
1613 | | - " f'''No of Unique Countries seen: {len(df_enriched['CountryName'].unique())} \n", |
1614 | | - " No of unique ASN : {len(df_enriched['SourceASN'].unique())}'''\n", |
| 1608 | + " f\"\"\"No of Unique Countries seen: {len(df_enriched[\"CountryName\"].unique())} \n", |
| 1609 | + " No of unique ASN : {len(df_enriched[\"SourceASN\"].unique())}\"\"\"\n", |
1615 | 1610 | ")" |
1616 | 1611 | ] |
1617 | 1612 | }, |
|
1647 | 1642 | } |
1648 | 1643 | ], |
1649 | 1644 | "source": [ |
1650 | | - "df_enriched['Event Name'].value_counts()" |
| 1645 | + "df_enriched[\"Event Name\"].value_counts()" |
1651 | 1646 | ] |
1652 | 1647 | }, |
1653 | 1648 | { |
|
1765 | 1760 | } |
1766 | 1761 | ], |
1767 | 1762 | "source": [ |
1768 | | - "df_enriched['User ID'].value_counts().reset_index()" |
| 1763 | + "df_enriched[\"User ID\"].value_counts().reset_index()" |
1769 | 1764 | ] |
1770 | 1765 | }, |
1771 | 1766 | { |
|
1924 | 1919 | } |
1925 | 1920 | ], |
1926 | 1921 | "source": [ |
1927 | | - "pd.set_option('max_colwidth', 200)\n", |
| 1922 | + "pd.set_option(\"max_colwidth\", 200)\n", |
1928 | 1923 | "\n", |
1929 | | - "df_enriched.groupby(['Event Name','Request Parameters'])['Alert'].agg({'count'})" |
| 1924 | + "df_enriched.groupby([\"Event Name\", \"Request Parameters\"])[\"Alert\"].agg({\"count\"})" |
1930 | 1925 | ] |
1931 | 1926 | }, |
1932 | 1927 | { |
|
1963 | 1958 | } |
1964 | 1959 | ], |
1965 | 1960 | "source": [ |
1966 | | - "df_enriched['Request User Agent'].value_counts().tail(5)" |
| 1961 | + "df_enriched[\"Request User Agent\"].value_counts().tail(5)" |
1967 | 1962 | ] |
1968 | 1963 | }, |
1969 | 1964 | { |
|
2000 | 1995 | } |
2001 | 1996 | ], |
2002 | 1997 | "source": [ |
2003 | | - "df_enriched['CountryName'].value_counts().head(5)" |
| 1998 | + "df_enriched[\"CountryName\"].value_counts().head(5)" |
2004 | 1999 | ] |
2005 | 2000 | }, |
2006 | 2001 | { |
|
2200 | 2195 | ], |
2201 | 2196 | "source": [ |
2202 | 2197 | "# Repeat IP Addresses\n", |
2203 | | - "df_grouped = df_enriched.groupby(['Source IP'])['Source IP'].agg({'count'})\n", |
| 2198 | + "df_grouped = df_enriched.groupby([\"Source IP\"])[\"Source IP\"].agg({\"count\"})\n", |
2204 | 2199 | "\n", |
2205 | | - "df_grouped[df_grouped['count'] > 1].sort_values(by='count', ascending=False)" |
| 2200 | + "df_grouped[df_grouped[\"count\"] > 1].sort_values(by=\"count\", ascending=False)" |
2206 | 2201 | ] |
2207 | 2202 | }, |
2208 | 2203 | { |
|
2360 | 2355 | ], |
2361 | 2356 | "source": [ |
2362 | 2357 | "# ASNs seen in more than one record\n", |
2363 | | - "df_asngrouped = df_enriched.groupby(['SourceASN'])['Source IP'].agg({'count'})\n", |
| 2358 | + "df_asngrouped = df_enriched.groupby([\"SourceASN\"])[\"Source IP\"].agg({\"count\"})\n", |
2364 | 2359 | "\n", |
2365 | | - "df_asngrouped[df_asngrouped['count'] > 1].sort_values(by='count', ascending=False)" |
| 2360 | + "df_asngrouped[df_asngrouped[\"count\"] > 1].sort_values(by=\"count\", ascending=False)" |
2366 | 2361 | ] |
2367 | 2362 | }, |
2368 | 2363 | { |
|
2461 | 2456 | } |
2462 | 2457 | ], |
2463 | 2458 | "source": [ |
2464 | | - "ti_resp.groupby(['Severity', 'Provider'])['Ioc'].agg({'count'})" |
| 2459 | + "ti_resp.groupby([\"Severity\", \"Provider\"])[\"Ioc\"].agg({\"count\"})" |
2465 | 2460 | ] |
2466 | 2461 | }, |
2467 | 2462 | { |
|
2505 | 2500 | } |
2506 | 2501 | ], |
2507 | 2502 | "source": [ |
2508 | | - "plt.rcParams[\"figure.figsize\"] = (12,6)\n", |
2509 | | - "plt.style.use('seaborn-darkgrid')\n", |
| 2503 | + "plt.rcParams[\"figure.figsize\"] = (12, 6)\n", |
| 2504 | + "plt.style.use(\"seaborn-darkgrid\")\n", |
2510 | 2505 | "\n", |
2511 | 2506 | "fig, ax = plt.subplots()\n", |
2512 | 2507 | "ax.plot(monthly_df[\"MonthofYear\"], monthly_df[\"count\"])\n", |
2513 | 2508 | "\n", |
2514 | | - "ax.set(xlabel='Per Month', ylabel='Count per each month',\n", |
2515 | | - " title='Monthly distribution of alerts')\n", |
| 2509 | + "ax.set(\n", |
| 2510 | + " xlabel=\"Per Month\", ylabel=\"Count per each month\", title=\"Monthly distribution of alerts\"\n", |
| 2511 | + ")\n", |
2516 | 2512 | "plt.xticks(rotation=60)\n", |
2517 | 2513 | "\n", |
2518 | 2514 | "plt.show()" |
|
2550 | 2546 | ], |
2551 | 2547 | "source": [ |
2552 | 2548 | "# Create dataset with count per Country\n", |
2553 | | - "country_df = (\n", |
2554 | | - " df_enriched.groupby([\"CountryName\"])[\"CountryName\"].agg({\"count\"}).reset_index()\n", |
2555 | | - ")\n", |
| 2549 | + "country_df = df_enriched.groupby([\"CountryName\"])[\"CountryName\"].agg({\"count\"}).reset_index()\n", |
2556 | 2550 | "\n", |
2557 | 2551 | "# normalize the count range to populate color palette\n", |
2558 | 2552 | "norm = matplotlib.colors.Normalize(\n", |
|
2617 | 2611 | ], |
2618 | 2612 | "source": [ |
2619 | 2613 | "# Alert counts per source ASN and country\n", |
2620 | | - "df_asngrouped = df_enriched.groupby([\"SourceASN\", \"CountryName\"])[\"Alert\"].agg(\n", |
2621 | | - " {\"count\"}\n", |
2622 | | - ")\n", |
| 2614 | + "df_asngrouped = df_enriched.groupby([\"SourceASN\", \"CountryName\"])[\"Alert\"].agg({\"count\"})\n", |
2623 | 2615 | "\n", |
2624 | 2616 | "# Filter records with count less than 4\n", |
2625 | 2617 | "df2 = (\n", |
|
2679 | 2671 | "\n", |
2680 | 2672 | "folium_map = FoliumMap()\n", |
2681 | 2673 | "\n", |
| 2674 | + "\n", |
2682 | 2675 | "def format_ip_entity(row, ip_col):\n", |
2683 | 2676 | " ip_entity = entities.IpAddress(Address=row[ip_col])\n", |
2684 | 2677 | " iplocation.lookup_ip(ip_entity=ip_entity)\n", |
2685 | 2678 | " if \"severity\" in row:\n", |
2686 | 2679 | " ip_entity.AdditionalData[\"threat severity\"] = row[\"severity\"]\n", |
2687 | 2680 | " return ip_entity\n", |
2688 | 2681 | "\n", |
| 2682 | + "\n", |
2689 | 2683 | "# Filtering high and warning IPs to display on Geomap\n", |
2690 | 2684 | "ti_resp_threats = ti_resp[ti_resp.Severity.isin([\"high\", \"warning\"])]\n", |
2691 | 2685 | "\n", |
2692 | 2686 | "ips_threats = list(ti_resp_threats.apply(lambda x: format_ip_entity(x, \"Ioc\"), axis=1))\n", |
2693 | 2687 | "\n", |
2694 | 2688 | "# Convert our IP addresses in string format into an ip address entity\n", |
2695 | 2689 | "ip_entity = entities.IpAddress()\n", |
2696 | | - "ip_list = [convert_to_ip_entities(i)[0] for i in ti_resp_threats['Ioc']]\n", |
2697 | | - " \n", |
| 2690 | + "ip_list = [convert_to_ip_entities(i)[0] for i in ti_resp_threats[\"Ioc\"]]\n", |
| 2691 | + "\n", |
2698 | 2692 | "# Get center location of all IP locations to center the map on\n", |
2699 | 2693 | "location = get_map_center(ip_list)\n", |
2700 | 2694 | "s3bucket_map = FoliumMap(location=location, zoom_start=2)\n", |
|
2703 | 2697 | "if len(ip_list) > 0:\n", |
2704 | 2698 | " icon_props = {\"color\": \"red\"}\n", |
2705 | 2699 | " s3bucket_map.add_ip_cluster(ip_entities=ips_threats, **icon_props)\n", |
2706 | | - " \n", |
| 2700 | + "\n", |
2707 | 2701 | "display(s3bucket_map.folium_map)" |
2708 | 2702 | ] |
2709 | 2703 | }, |
|
0 commit comments