Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"workbench.colorCustomizations": {
"terminal.background": "#00000000",
"minimap.background": "#00000000",
"scrollbar.shadow": "#00000000"
}
}
Binary file added ProjectMindmapv0.5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Common options:
- `--limit N`: process only the first N lines.
- `--format jsonl|csv`: output for enriched events (default: `jsonl`).
- `--color auto|always|never`: terminal color policy.
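- `--ai-malicious-report`: after CTI summarization, ask the LLM for a detailed malicious-activity report (saved under `reports/`). Requires the LLM to be enabled; the report is skipped when no IP shows strong malicious CTI signals.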

LLM request control:

Expand Down Expand Up @@ -69,6 +70,7 @@ Create a `.env` (see variables below). Keys are optional; the tool runs offline

- Enriched events: `data/processed/<name>.jsonl` (or `.csv` with `--format csv`).
- Reports: `data/processed/reports/report.txt` and `report.md` summarizing activity and suspicious IPs; may include a brief AI note if LLM is enabled.
- Malicious AI report (optional): `data/processed/reports/malicious_ai_report.txt|md` if `--ai-malicious-report` is used and malicious CTI signals are present.
- CTI cache: `data/cache/cti_cache.json` (auto‑created and reused to minimize network calls).

## Testing
Expand All @@ -80,6 +82,15 @@ Notes:
- If you used the local venv above, run tests via `.venv/bin/pytest -q`.
- A PyPDF2 deprecation warning may appear; it’s harmless and can be ignored.

## UI Dashboard

An optional Streamlit dashboard is included for exploration and client-friendly viewing.

- Install the UI dependencies with `pip install -r requirements.txt` (Streamlit, pandas, and Altair are already listed there).
- Run the UI: `scripts/run_ui.sh` (or `streamlit run ui/app.py`).
- Select an enriched `.jsonl` file from `data/processed/` or upload one.
- View status distribution, sample enriched events, and CTI attributes.

## Troubleshooting

- `.txt` auto‑detection: the CLI reads a small sample and parses with `parse_line`. If none match, the file is copied as plain text rather than parsed as logs.
Expand Down
185 changes: 185 additions & 0 deletions data/processed-test/access_log.jsonl

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions data/processed-test/new_log.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/91.0", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/91.0", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "msnbot/1.1 (+http://search.msn.com/msnbot.htm)", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/robots.txt", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/robots.txt", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "msnbot/1.1 (+http://search.msn.com/msnbot.htm)", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/robots.txt", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/91.0", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@ pytest-cov>=5.0.0
rich>=13.7.1
uvloop; platform_system != 'Windows'
markdown>=3.6
streamlit>=1.34.0
pandas>=2.2.2
altair>=5.3.0
9 changes: 9 additions & 0 deletions scripts/run_ui.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Launch the Streamlit dashboard (ui/app.py).
#
# Usage: scripts/run_ui.sh [extra `streamlit run` arguments...]
#
# The script changes to the repository root (the parent of this script's
# directory) before doing anything else, so the relative paths below resolve
# correctly no matter which directory it is invoked from.
set -euo pipefail

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$repo_root"

# Activate the local virtualenv when one is present. Test for the activate
# script itself: a bare `.venv` directory without it would otherwise abort the
# run under `set -e`.
if [ -f .venv/bin/activate ]; then
  # shellcheck disable=SC1091
  . .venv/bin/activate
fi

# Replace this shell with streamlit, forwarding any extra CLI arguments.
exec streamlit run ui/app.py "$@"

71 changes: 66 additions & 5 deletions src/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,13 @@
from .enrichers.llm_enricher import enrich_log_records
from .enrichers.cti_service import cti_for_ips
from .parsers.ua_analysis import detect_suspicious_user_agent
from .reports.report_builder import build_text_report, build_markdown_report
from .reports.report_builder import (
build_text_report,
build_markdown_report,
build_malicious_ai_report,
)
from .config import get_settings
from .groq_client import GroqRotatingClient


rich_traceback_install(show_locals=False)
Expand All @@ -46,6 +51,7 @@ def process_log(
cti_max: int | None = None,
cti_batch_size: int | None = None,
cti_batch_pause: float = 0.0,
ai_malicious_report: bool = False,
) -> Path:
console.rule("[bold cyan]🔎 Parsing Log")
console.log(f"Parsing log: [bold]{path}")
Expand Down Expand Up @@ -104,6 +110,63 @@ def process_log(
)
console.log(f"Reports saved: [bold]{txt_path}[/], [bold]{md_path}[/]")

# Optional: generate a detailed malicious activity report using LLM
if ai_malicious_report and use_llm and suspicious_rows:
try:

# Select IPs with strongest malicious indicators
def is_malicious(row: dict[str, object]) -> bool:
    """Return True when a CTI summary row carries a strong malicious signal.

    A row qualifies when any of the following holds:
    - its ``risk`` is "high" (case-insensitive),
    - its ``talos_reputation`` is "untrusted" or "malicious" (case-insensitive),
    - ``vt_malicious`` is at least 1,
    - ``vt_suspicious`` is at least 3.

    Missing or falsy VirusTotal counts are treated as 0.
    """
    # All four fields are normalised up front, before any decision is made,
    # so a malformed count surfaces regardless of the other signals.
    risk_level = str(row.get("risk", "unknown")).lower()
    talos_verdict = str(row.get("talos_reputation", "")).lower()
    vt_malicious_hits = int(row.get("vt_malicious") or 0)
    vt_suspicious_hits = int(row.get("vt_suspicious") or 0)

    if risk_level == "high":
        return True
    if talos_verdict in ("untrusted", "malicious"):
        return True
    if vt_malicious_hits >= 1:
        return True
    return vt_suspicious_hits >= 3

malicious = [r for r in suspicious_rows if is_malicious(r)]
if malicious:
# Derive minimal per-IP context from enriched events (top paths/UA)
from collections import Counter as _C
per_ip_paths: dict[str, list[tuple[str, int]]] = {}
per_ip_ua: dict[str, str] = {}
for ip in {str(r.get("ip")) for r in malicious}:
paths = _C([str(e.get("path")) for e in enriched if str(e.get("ip")) == ip and e.get("path")])
per_ip_paths[ip] = paths.most_common(5)
# pick any UA string observed
for e in enriched:
if str(e.get("ip")) == ip and (e.get("ua") or e.get("user_agent")):
per_ip_ua[ip] = str(e.get("ua") or e.get("user_agent"))
break
# Build prompt
insight_req = {
"malicious": malicious[:20], # cap to keep prompt small
"per_ip_top_paths": per_ip_paths,
"per_ip_ua": per_ip_ua,
}
client = GroqRotatingClient()
content = client.chat([
{
"role": "system",
"content": (
"You are a senior SOC analyst. Draft a concise but detailed incident note summarizing malicious "
"activity detected in logs corroborated by CTI (AbuseIPDB, Talos, VirusTotal). "
"Include: IP(s), CTI signals, notable paths, suspected TTPs, and recommended actions (blocking, WAF rules, triage). "
"Use clear sections and bullets."
),
},
{"role": "user", "content": json.dumps(insight_req)},
])
rpt_txt, rpt_md = build_malicious_ai_report(reports_dir, content)
console.log(f"Malicious AI report saved: [bold]{rpt_txt}[/], [bold]{rpt_md}[/]")
else:
console.log("[dim]No strong malicious CTI signals; skipping detailed AI report.")
except Exception as e: # pragma: no cover - network/env specific
console.log(f"[dim]Malicious AI report unavailable: {e}")

return out_path


Expand Down Expand Up @@ -234,8 +297,6 @@ def summarize_and_cti(
ai_insight: str | None = None
if use_llm:
try:
from .groq_client import GroqRotatingClient

client = GroqRotatingClient()
insight_req = {
"total_requests": total_requests,
Expand Down Expand Up @@ -264,8 +325,6 @@ def process_pdf(path: Path, out_dir: Path, use_llm: bool) -> Path:
out_path.write_text(text, encoding="utf-8")
# Optional: one-shot summary with LLM
if use_llm and text.strip():
from .groq_client import GroqRotatingClient

client = GroqRotatingClient()
summary = client.chat([
{"role": "system", "content": "Summarize the key findings in 5 bullets."},
Expand Down Expand Up @@ -347,6 +406,7 @@ def main(argv: List[str] | None = None) -> int:
parser.add_argument("--format", choices=["jsonl", "csv"], default="jsonl", help="Output format for logs")
parser.add_argument("--no-cti", action="store_true", help="Disable CTI lookups")
parser.add_argument("--no-reports", action="store_true", help="Do not build text/markdown reports")
parser.add_argument("--ai-malicious-report", action="store_true", help="Generate detailed AI report for malicious CTI signals")
parser.add_argument("--color", choices=["auto", "always", "never"], default="auto", help="Terminal color policy")
# LLM request controls
parser.add_argument("--llm-sample", type=int, default=200, help="Limit LLM calls by sampling this many groups (0=all)")
Expand Down Expand Up @@ -427,6 +487,7 @@ def _looks_like_log_file(p: Path, sample_lines: int = 200) -> bool:
cti_max=(None if args.cti_max in (None, 0) else max(0, int(args.cti_max))),
cti_batch_size=(None if getattr(args, 'cti_batch_size', 0) in (None, 0) else max(1, int(args.cti_batch_size))),
cti_batch_pause=float(getattr(args, 'cti_batch_pause', 0.0) or 0.0),
ai_malicious_report=bool(args.ai_malicious_report),
)
# Load enriched to drive summary/preview
enriched_records = [json.loads(l) for l in (out_dir / f"{path.stem}.jsonl").read_text(encoding="utf-8").splitlines()] if args.format == "jsonl" else None
Expand Down
72 changes: 72 additions & 0 deletions src/enrichers/cti_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,22 @@ class AbuseIPDBResult:
url: str


@dataclass
class TalosResult:
    """Outcome of a Talos reputation lookup for a single IP address."""

    # IP address that was looked up.
    ip: str
    # Reputation label extracted from the lookup page; None when unavailable.
    reputation: Optional[str]
    # Owner text extracted from the lookup page; None when unavailable.
    owner: Optional[str]
    # Lookup URL that was (or would have been) queried.
    url: str


@dataclass
class VirusTotalResult:
    """Outcome of a VirusTotal lookup for a single IP address."""

    # IP address that was looked up.
    ip: str
    # "malicious" count from the last analysis stats; None when unavailable.
    malicious: Optional[int]
    # "suspicious" count from the last analysis stats; None when unavailable.
    suspicious: Optional[int]
    # API URL that was (or would have been) queried.
    url: str


def fetch_abuseipdb(ip: str, timeout: float = 15.0) -> AbuseIPDBResult:
# Lazy imports to keep tests independent of optional deps
try:
Expand Down Expand Up @@ -78,3 +94,59 @@ def _extract_text(patterns):
country=country,
url=url,
)


def fetch_talos(ip: str, timeout: float = 15.0) -> TalosResult:
    """Scrape the Talos Intelligence reputation lookup page for *ip*.

    This is a best-effort lookup: if ``httpx`` or ``bs4`` cannot be imported,
    or the HTTP request fails for any reason, a result with ``reputation`` and
    ``owner`` set to None is returned instead of raising.

    Args:
        ip: Address to look up; interpolated into the lookup URL as-is.
        timeout: Timeout passed to the HTTP client.

    Returns:
        A TalosResult carrying whatever reputation/owner text the heuristic
        patterns found in the page, plus the lookup URL.
    """
    lookup_url = f"https://talosintelligence.com/reputation_center/lookup?search={ip}"

    def _unknown() -> TalosResult:
        # Result used whenever no data could be obtained.
        return TalosResult(ip=ip, reputation=None, owner=None, url=lookup_url)

    # Optional dependencies are imported lazily so the module still loads
    # without them.
    try:
        import httpx  # type: ignore
    except Exception:  # pragma: no cover
        httpx = None  # type: ignore
    try:
        from bs4 import BeautifulSoup  # type: ignore
    except Exception:  # pragma: no cover
        BeautifulSoup = None  # type: ignore
    if httpx is None or BeautifulSoup is None:  # pragma: no cover
        return _unknown()

    try:
        with httpx.Client(follow_redirects=True, timeout=timeout) as http:
            response = http.get(lookup_url)
            response.raise_for_status()
            page_html = response.text
    except Exception:  # pragma: no cover
        return _unknown()

    # Flatten the page to space-separated text and apply label heuristics.
    page_text = BeautifulSoup(page_html, "html.parser").get_text(" ", strip=True)
    reputation_match = re.search(
        r"Web Reputation\s*:?\s*([A-Za-z]+)", page_text, re.IGNORECASE
    )
    # NOTE(review): this pattern also accepts whitespace, so on flattened text
    # it may capture words beyond the owner name; confirm against real pages.
    owner_match = re.search(r"Owner\s*:?\s*([\w\s\-\.,]+)", page_text, re.IGNORECASE)

    return TalosResult(
        ip=ip,
        reputation=reputation_match.group(1).strip() if reputation_match else None,
        owner=owner_match.group(1).strip() if owner_match else None,
        url=lookup_url,
    )


def fetch_virustotal(ip: str, api_key: Optional[str], timeout: float = 15.0) -> VirusTotalResult:
    """Query the VirusTotal v3 IP-address endpoint for *ip*.

    This is a best-effort lookup: a result with ``malicious`` and
    ``suspicious`` set to None is returned when no API key is supplied,
    ``httpx`` cannot be imported, the response status is 400 or above, or any
    error occurs while requesting or decoding the response.

    Args:
        ip: Address to look up; interpolated into the API URL as-is.
        api_key: Value sent in the ``x-apikey`` request header.
        timeout: Timeout passed to the HTTP client.

    Returns:
        A VirusTotalResult with the "malicious" and "suspicious" counts read
        from ``data.attributes.last_analysis_stats``, plus the API URL.
    """
    api_url = f"https://www.virustotal.com/api/v3/ip_addresses/{ip}"

    def _unknown() -> VirusTotalResult:
        # Result used whenever no data could be obtained.
        return VirusTotalResult(ip=ip, malicious=None, suspicious=None, url=api_url)

    if not api_key:  # pragma: no cover
        return _unknown()

    # Optional dependency, imported lazily.
    try:
        import httpx  # type: ignore
    except Exception:  # pragma: no cover
        return _unknown()

    try:
        with httpx.Client(timeout=timeout, headers={"x-apikey": api_key}) as http:
            response = http.get(api_url)
            if response.status_code >= 400:
                return _unknown()
            payload = response.json()
            analysis_stats = (
                payload.get("data", {})
                .get("attributes", {})
                .get("last_analysis_stats", {})
            )
            return VirusTotalResult(
                ip=ip,
                malicious=analysis_stats.get("malicious"),
                suspicious=analysis_stats.get("suspicious"),
                url=api_url,
            )
    except Exception:  # pragma: no cover
        return _unknown()
8 changes: 4 additions & 4 deletions src/parsers/ua_analysis.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import re
from typing import List, Tuple
from typing import List, Tuple, Optional


SUSPICIOUS_AGENTS = [
Expand All @@ -19,12 +19,12 @@
]


def detect_suspicious_user_agent(ua: Optional[str], patterns: Optional[List[str]] = None) -> Tuple[bool, Optional[str]]:
    """Check a User-Agent string against a list of suspicious regex patterns.

    Args:
        ua: Raw User-Agent header value; None or empty is never suspicious.
        patterns: Regex patterns to try, in order. When None or empty, the
            module-level SUSPICIOUS_AGENTS list is used.

    Returns:
        ``(True, pattern)`` for the first pattern that matches, where
        ``pattern`` is the pattern string exactly as supplied, or
        ``(False, None)`` when nothing matches.
    """
    if not ua:
        return False, None

    ua_l = ua.lower()
    pats = patterns if patterns else SUSPICIOUS_AGENTS
    for pat in pats:
        # The UA is lowercased, so a pattern containing uppercase characters
        # (e.g. "SQLMap") could never match without IGNORECASE. Everything
        # that matched case-sensitively before still matches with the flag.
        if re.search(pat, ua_l, re.IGNORECASE):
            return True, pat
    return False, None

38 changes: 35 additions & 3 deletions src/reports/report_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ def build_markdown_report(
if not suspicious:
lines.append("No suspicious IPs identified.\n")
else:
lines.append(_md_row(["IP", "Risk", "Abuse Score", "Total Reports", "Country", "Requests", "4xx", "Suspicious UA", "One-line Explain"]))
lines.append(_md_row(["---"] * 9))
lines.append(_md_row(["IP", "Risk", "Abuse Score", "Total Reports", "Country", "Requests", "4xx", "Suspicious UA", "Talos", "VT (mal/susp)", "One-line Explain"]))
lines.append(_md_row(["---"] * 11))
for s in suspicious:
lines.append(
_md_row([
Expand All @@ -45,6 +45,8 @@ def build_markdown_report(
str(s.get("requests", "")),
str(s.get("errors_4xx", "")),
"yes" if s.get("ua_suspicious") else "no",
str(s.get("talos_reputation", "")),
f"{s.get('vt_malicious','')}/{s.get('vt_suspicious','')}",
str(s.get("ai_one_liner", "")),
])
)
Expand Down Expand Up @@ -80,11 +82,41 @@ def build_text_report(
lines.append(
f"- {s.get('ip')} | risk={s.get('risk')} | score={s.get('abuse_confidence_score')} | "
f"reports={s.get('total_reports')} | country={s.get('country')} | req={s.get('requests')} | "
f"4xx={s.get('errors_4xx')} | UA suspicious={'yes' if s.get('ua_suspicious') else 'no'}\n"
f"4xx={s.get('errors_4xx')} | UA suspicious={'yes' if s.get('ua_suspicious') else 'no'} | "
f"talos={s.get('talos_reputation')} | vt={s.get('vt_malicious')}/{s.get('vt_suspicious')}\n"
)
if s.get("ai_one_liner"):
lines.append(f" AI: {s.get('ai_one_liner')}\n")

path.write_text("".join(lines), encoding="utf-8")
return path


def build_malicious_ai_report(
    out_dir: Path,
    content: str,
    *,
    title: str = "Malicious Activity AI Report",
) -> tuple[Path, Path]:
    """Save an AI-written malicious-activity report as plain text and Markdown.

    ``out_dir`` is created (including parents) if it does not exist. Both
    files contain ``content`` stripped of surrounding whitespace and
    terminated by a single newline. The text file puts the title above an
    ``=`` underline of the same length; the Markdown file uses a level-1
    heading.

    Args:
        out_dir: Directory that receives ``malicious_ai_report.txt`` and
            ``malicious_ai_report.md``.
        content: Report body.
        title: Heading used in both files.

    Returns:
        ``(txt_path, md_path)``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    report_body = content.strip() + "\n"

    # Plain-text flavour: title, underline, blank line, body.
    txt_path = out_dir / "malicious_ai_report.txt"
    underline = "=" * len(title)
    txt_path.write_text(f"{title}\n{underline}\n\n{report_body}", encoding="utf-8")

    # Markdown flavour: H1 heading, blank line, body.
    md_path = out_dir / "malicious_ai_report.md"
    md_path.write_text(f"# {title}\n\n{report_body}", encoding="utf-8")

    return txt_path, md_path
Loading