Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions read_log.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,37 @@

import pandas as pd


def __get_bot_names(df, normalize=False):
    """
    Get a frequency dictionary of bot requests.

    Rows of ``df["user_agent"]`` that contain "bot" (case-insensitive) are
    scanned for the first run of letters ending in "bot"; that token, with
    its original casing, is used as the bot name.

    Args:
        df: DataFrame with a ``user_agent`` column. Missing values are
            skipped.
        normalize: If True, return each bot's share of all counted bot
            requests (rounded to 5 decimals) instead of raw counts.

    Returns:
        A dict mapping bot name to its count (or share), ordered from most
        to least frequent. Empty if no bot user agents are found.
    """
    pattern = re.compile(r"([A-Za-z]+bot)\b", re.IGNORECASE)
    bot_names = {}

    # Find all user agents that contain the word "bot" case-insensitive.
    # na=False keeps the mask strictly boolean when a user agent is missing;
    # a NaN in the mask would make the row selection below raise.
    mask = df["user_agent"].str.lower().str.contains("bot", na=False)

    for agent in df.loc[mask, "user_agent"]:
        match = pattern.search(agent)
        if match:
            bot_name = match.group(1)
            # The first sighting counts as 1, not 0.
            bot_names[bot_name] = bot_names.get(bot_name, 0) + 1

    # Convert the numbers into percentages that add up to 1 if noted.
    if normalize:
        # Every stored count is >= 1, so total is non-zero whenever the
        # loop below has anything to iterate over.
        total = sum(bot_names.values())

        for name in bot_names:
            bot_names[name] = round(bot_names[name] / total, 5)

    # Sort by the frequency in descending order
    return dict(sorted(bot_names.items(), key=lambda item: item[1], reverse=True))


# HTTP request method names.
methods = [
    "GET",
    "HEAD",
    "POST",
    "PUT",
    "DELETE",
    "OPTIONS",
    "TRACE",
    "PATCH",
]

# Regex for a single log line, assembled from adjacent raw-string pieces.
# It has seven capture groups; the field meanings noted below are
# presumed from the layout -- confirm against the code that consumes them.
pattern = (
    r'^([\d.]+) - - '        # 1: run of digits and dots (presumably client IP)
    r'\[([^]]+)\] '          # 2: bracketed text (presumably timestamp)
    r'"([^"]*)" '            # 3: quoted string (presumably request line)
    r'(\d+) (\d+) '          # 4, 5: two integers (presumably status, size)
    r'"([^"]*)" "([^"]*)" '  # 6, 7: quoted strings (presumably referrer, user agent)
    r'"-"$'                  # literal trailing "-" field
)

Expand Down