Azerbaijan-Cybersecurity-Center · pierringshot · Sep 4, 2025
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "workbench.colorCustomizations": {
+        "terminal.background": "#00000000",
+        "minimap.background": "#00000000",
+        "scrollbar.shadow": "#00000000"
+    }
+}
diff --git a/ProjectMindmapv0.5.png b/ProjectMindmapv0.5.png
diff --git a/ProjectMindmapv0.5.png:Zone.Identifier b/ProjectMindmapv0.5.png:Zone.Identifier
diff --git a/README.md b/README.md
@@ -31,6 +31,7 @@ Common options:
 - `--limit N`: process only the first N lines.
 - `--format jsonl|csv`: output for enriched events (default: `jsonl`).
 - `--color auto|always|never`: terminal color policy.
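+- `--ai-malicious-report`: after CTI summarization, ask the LLM for a detailed malicious-activity report (saved under `data/processed/reports/`). Requires the LLM to be enabled; the report is skipped when no IP shows strong malicious CTI signals.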
 
 LLM request control:
 
@@ -69,6 +70,7 @@ Create a `.env` (see variables below). Keys are optional; the tool runs offline
 
 - Enriched events: `data/processed/<name>.jsonl` (or `.csv` with `--format csv`).
 - Reports: `data/processed/reports/report.txt` and `report.md` summarizing activity and suspicious IPs; may include a brief AI note if LLM is enabled.
+- Malicious AI report (optional): `data/processed/reports/malicious_ai_report.txt` and `malicious_ai_report.md`, written only when `--ai-malicious-report` is used, the LLM is enabled, and malicious CTI signals are present.
 - CTI cache: `data/cache/cti_cache.json` (auto‑created and reused to minimize network calls).
 
 ## Testing
@@ -80,6 +82,15 @@ Notes:
 - If you used the local venv above, run tests via `.venv/bin/pytest -q`.
 - A PyPDF2 deprecation warning may appear; it’s harmless and can be ignored.
 
+## UI Dashboard
+
+An optional Streamlit dashboard is included for exploration and client-friendly viewing.
+
+- Install the UI dependencies with `pip install -r requirements.txt` (Streamlit, pandas, and Altair are already listed there).
+- Run the UI: `scripts/run_ui.sh` (or `streamlit run ui/app.py`).
+- Select an enriched `.jsonl` file from `data/processed/` or upload one.
+- View status distribution, sample enriched events, and CTI attributes.
+
 ## Troubleshooting
 
 - `.txt` auto‑detection: the CLI reads a small sample and parses with `parse_line`. If none match, the file is copied as plain text rather than parsed as logs.

diff --git a/data/processed-test/access_log.jsonl b/data/processed-test/access_log.jsonl
diff --git a/data/processed-test/new_log.jsonl b/data/processed-test/new_log.jsonl
@@ -0,0 +1,15 @@
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/91.0", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/91.0", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "msnbot/1.1 (+http://search.msn.com/msnbot.htm)", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/robots.txt", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/robots.txt", "proto": "", "status": 301, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/sitemap.xml", "proto": "", "status": 301, "size": null, "ref": null, "ua": "msnbot/1.1 (+http://search.msn.com/msnbot.htm)", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
+{"ip": "5.135.75.243", "time": "2025-09-04T06:22:10+00:00", "method": "GET", "path": "/robots.txt", "proto": "TLSv1.3", "status": 200, "size": null, "ref": null, "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/91.0", "severity": "unknown", "iocs": ["5.135.75.243"], "rationale": "LLM disabled"}
diff --git a/requirements.txt b/requirements.txt
@@ -11,3 +11,6 @@ pytest-cov>=5.0.0
 rich>=13.7.1
 uvloop; platform_system != 'Windows'
 markdown>=3.6
+streamlit>=1.34.0
+pandas>=2.2.2
+altair>=5.3.0
diff --git a/scripts/run_ui.sh b/scripts/run_ui.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+# Launch the Streamlit dashboard (ui/app.py) from the repository root.
+set -euo pipefail
+
+# Resolve the repository root from this script's own location so that the
+# relative paths below work regardless of the caller's working directory.
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+# Activate the project virtualenv only when its activate script really exists;
+# testing the directory alone would abort under `set -e` if the file is missing.
+if [ -f .venv/bin/activate ]; then
+  . .venv/bin/activate
+fi
+
+exec streamlit run ui/app.py
+
diff --git a/src/cli.py b/src/cli.py
@@ -20,8 +20,13 @@
 from .enrichers.llm_enricher import enrich_log_records
 from .enrichers.cti_service import cti_for_ips
 from .parsers.ua_analysis import detect_suspicious_user_agent
-from .reports.report_builder import build_text_report, build_markdown_report
+from .reports.report_builder import (
+    build_text_report,
+    build_markdown_report,
+    build_malicious_ai_report,
+)
 from .config import get_settings
+from .groq_client import GroqRotatingClient
 
 
 rich_traceback_install(show_locals=False)
@@ -46,6 +51,7 @@ def process_log(
     cti_max: int | None = None,
     cti_batch_size: int | None = None,
     cti_batch_pause: float = 0.0,
+    ai_malicious_report: bool = False,
 ) -> Path:
     console.rule("[bold cyan]🔎 Parsing Log")
     console.log(f"Parsing log: [bold]{path}")
@@ -104,6 +110,63 @@ def process_log(
         )
         console.log(f"Reports saved: [bold]{txt_path}[/], [bold]{md_path}[/]")
 
+        # Optional: generate a detailed malicious activity report using LLM
+        if ai_malicious_report and use_llm and suspicious_rows:
+            try:
+
+                # Select IPs with strongest malicious indicators
+                def is_malicious(row: dict[str, object]) -> bool:
+                    """Return True when a suspicious-IP row carries a strong malicious signal.
+
+                    A row qualifies when any of these hold: ``risk`` is "high",
+                    ``talos_reputation`` is "untrusted" or "malicious", at least one
+                    VirusTotal engine flags it malicious, or at least three flag it
+                    suspicious. String comparisons are case-insensitive.
+                    """
+
+                    def _count(value: object) -> int:
+                        # CTI fields may be missing, None, or non-numeric text; treat
+                        # anything that is not a usable number as zero so one odd row
+                        # cannot abort the whole report.
+                        try:
+                            return int(value or 0)  # type: ignore[call-overload]
+                        except (TypeError, ValueError, OverflowError):
+                            return 0
+
+                    risk = str(row.get("risk", "unknown")).lower()
+                    talos = str(row.get("talos_reputation", "")).lower()
+                    vt_mal = _count(row.get("vt_malicious"))
+                    vt_susp = _count(row.get("vt_suspicious"))
+                    return (
+                        risk in {"high"}
+                        or talos in {"untrusted", "malicious"}
+                        or vt_mal >= 1
+                        or vt_susp >= 3
+                    )
+
+                malicious = [r for r in suspicious_rows if is_malicious(r)]
+                if malicious:
+                    # Derive minimal per-IP context from enriched events (top paths/UA)
+                    from collections import Counter as _C
+                    per_ip_paths: dict[str, list[tuple[str, int]]] = {}
+                    per_ip_ua: dict[str, str] = {}
+                    for ip in {str(r.get("ip")) for r in malicious}:
+                        paths = _C([str(e.get("path")) for e in enriched if str(e.get("ip")) == ip and e.get("path")])
+                        per_ip_paths[ip] = paths.most_common(5)
+                        # pick any UA string observed
+                        for e in enriched:
+                            if str(e.get("ip")) == ip and (e.get("ua") or e.get("user_agent")):
+                                per_ip_ua[ip] = str(e.get("ua") or e.get("user_agent"))
+                                break
+                    # Build prompt
+                    insight_req = {
+                        "malicious": malicious[:20],  # cap to keep prompt small
+                        "per_ip_top_paths": per_ip_paths,
+                        "per_ip_ua": per_ip_ua,
+                    }
+                    client = GroqRotatingClient()
+                    content = client.chat([
+                        {
+                            "role": "system",
+                            "content": (
+                                "You are a senior SOC analyst. Draft a concise but detailed incident note summarizing malicious "
+                                "activity detected in logs corroborated by CTI (AbuseIPDB, Talos, VirusTotal). "
+                                "Include: IP(s), CTI signals, notable paths, suspected TTPs, and recommended actions (blocking, WAF rules, triage). "
+                                "Use clear sections and bullets."
+                            ),
+                        },
+                        {"role": "user", "content": json.dumps(insight_req)},
+                    ])
+                    rpt_txt, rpt_md = build_malicious_ai_report(reports_dir, content)
+                    console.log(f"Malicious AI report saved: [bold]{rpt_txt}[/], [bold]{rpt_md}[/]")
+                else:
+                    console.log("[dim]No strong malicious CTI signals; skipping detailed AI report.")
+            except Exception as e:  # pragma: no cover - network/env specific
+                console.log(f"[dim]Malicious AI report unavailable: {e}")
+
     return out_path
 
 
@@ -234,8 +297,6 @@ def summarize_and_cti(
     ai_insight: str | None = None
     if use_llm:
         try:
-            from .groq_client import GroqRotatingClient
-
             client = GroqRotatingClient()
             insight_req = {
                 "total_requests": total_requests,
@@ -264,8 +325,6 @@ def process_pdf(path: Path, out_dir: Path, use_llm: bool) -> Path:
     out_path.write_text(text, encoding="utf-8")
     # Optional: one-shot summary with LLM
     if use_llm and text.strip():
-        from .groq_client import GroqRotatingClient
-
         client = GroqRotatingClient()
         summary = client.chat([
             {"role": "system", "content": "Summarize the key findings in 5 bullets."},
@@ -347,6 +406,7 @@ def main(argv: List[str] | None = None) -> int:
     parser.add_argument("--format", choices=["jsonl", "csv"], default="jsonl", help="Output format for logs")
     parser.add_argument("--no-cti", action="store_true", help="Disable CTI lookups")
     parser.add_argument("--no-reports", action="store_true", help="Do not build text/markdown reports")
+    parser.add_argument("--ai-malicious-report", action="store_true", help="Generate detailed AI report for malicious CTI signals")
     parser.add_argument("--color", choices=["auto", "always", "never"], default="auto", help="Terminal color policy")
     # LLM request controls
     parser.add_argument("--llm-sample", type=int, default=200, help="Limit LLM calls by sampling this many groups (0=all)")
@@ -427,6 +487,7 @@ def _looks_like_log_file(p: Path, sample_lines: int = 200) -> bool:
             cti_max=(None if args.cti_max in (None, 0) else max(0, int(args.cti_max))),
             cti_batch_size=(None if getattr(args, 'cti_batch_size', 0) in (None, 0) else max(1, int(args.cti_batch_size))),
             cti_batch_pause=float(getattr(args, 'cti_batch_pause', 0.0) or 0.0),
+            ai_malicious_report=bool(args.ai_malicious_report),
         )
         # Load enriched to drive summary/preview
         enriched_records = [json.loads(l) for l in (out_dir / f"{path.stem}.jsonl").read_text(encoding="utf-8").splitlines()] if args.format == "jsonl" else None

diff --git a/src/enrichers/cti_providers.py b/src/enrichers/cti_providers.py
@@ -14,6 +14,22 @@ class AbuseIPDBResult:
     url: str
 
 
+@dataclass
+class TalosResult:
+    """Outcome of a Talos reputation-center lookup for one IP (built by ``fetch_talos``).
+
+    ``reputation`` and ``owner`` are ``None`` when the lookup could not be
+    performed or the corresponding label was not found in the page text.
+    """
+
+    ip: str  # address that was looked up
+    reputation: Optional[str]  # word captured after "Web Reputation" in the page text
+    owner: Optional[str]  # text captured after "Owner" in the page text
+    url: str  # lookup URL that was requested
+
+
+@dataclass
+class VirusTotalResult:
+    """Outcome of a VirusTotal IP-address lookup (built by ``fetch_virustotal``).
+
+    Both counts are ``None`` when no API key was supplied, the HTTP client is
+    unavailable, the response status is >= 400, or the request failed.
+    """
+
+    ip: str  # address that was looked up
+    malicious: Optional[int]  # "malicious" entry of last_analysis_stats
+    suspicious: Optional[int]  # "suspicious" entry of last_analysis_stats
+    url: str  # API URL that was requested
+
+
 def fetch_abuseipdb(ip: str, timeout: float = 15.0) -> AbuseIPDBResult:
     # Lazy imports to keep tests independent of optional deps
     try:
@@ -78,3 +94,59 @@ def _extract_text(patterns):
         country=country,
         url=url,
     )
+
+
+def fetch_talos(ip: str, timeout: float = 15.0) -> TalosResult:
+    """Scrape the Talos reputation-center lookup page for *ip*.
+
+    Args:
+        ip: Address to look up; interpolated into the ``search`` query parameter.
+        timeout: Timeout passed to the HTTP client.
+
+    Returns:
+        A ``TalosResult``. ``reputation`` and ``owner`` stay ``None`` when
+        ``httpx`` or ``bs4`` cannot be imported, when the request fails, or
+        when the heuristic patterns find nothing in the page text.
+    """
+    url = f"https://talosintelligence.com/reputation_center/lookup?search={ip}"
+    unavailable = TalosResult(ip=ip, reputation=None, owner=None, url=url)
+
+    # Lazy imports keep the module importable without the optional dependencies.
+    try:
+        import httpx  # type: ignore
+        from bs4 import BeautifulSoup  # type: ignore
+    except Exception:  # pragma: no cover
+        return unavailable
+
+    try:
+        with httpx.Client(follow_redirects=True, timeout=timeout) as client:
+            response = client.get(url)
+            response.raise_for_status()
+            page_html = response.text
+    except Exception:  # pragma: no cover
+        return unavailable
+
+    page_text = BeautifulSoup(page_html, "html.parser").get_text(" ", strip=True)
+
+    # Heuristic patterns run over the flattened page text.
+    # NOTE(review): the owner pattern also accepts whitespace, so it may capture
+    # trailing page text beyond the owner name; confirm against real responses.
+    rep_match = re.search(r"Web Reputation\s*:?\s*([A-Za-z]+)", page_text, re.IGNORECASE)
+    owner_match = re.search(r"Owner\s*:?\s*([\w\s\-\.,]+)", page_text, re.IGNORECASE)
+    return TalosResult(
+        ip=ip,
+        reputation=rep_match.group(1).strip() if rep_match else None,
+        owner=owner_match.group(1).strip() if owner_match else None,
+        url=url,
+    )
+
+
+def fetch_virustotal(ip: str, api_key: Optional[str], timeout: float = 15.0) -> VirusTotalResult:
+    """Query the VirusTotal v3 IP-address endpoint for analysis counts.
+
+    Args:
+        ip: Address to look up.
+        api_key: VirusTotal API key, sent as the ``x-apikey`` header.
+        timeout: Timeout passed to the HTTP client.
+
+    Returns:
+        A ``VirusTotalResult`` whose counts come from ``last_analysis_stats``.
+        Both counts are ``None`` when no key is given, ``httpx`` cannot be
+        imported, the response status is >= 400, or the request/parsing raises.
+    """
+    url = f"https://www.virustotal.com/api/v3/ip_addresses/{ip}"
+
+    def _result(malicious: Optional[int] = None, suspicious: Optional[int] = None) -> VirusTotalResult:
+        # Single construction point so every exit path shares the same ip/url.
+        return VirusTotalResult(ip=ip, malicious=malicious, suspicious=suspicious, url=url)
+
+    if not api_key:  # pragma: no cover
+        return _result()
+    try:
+        import httpx  # type: ignore
+    except Exception:  # pragma: no cover
+        return _result()
+
+    try:
+        with httpx.Client(timeout=timeout, headers={"x-apikey": api_key}) as client:
+            response = client.get(url)
+            if response.status_code >= 400:
+                return _result()
+            payload = response.json()
+            attributes = payload.get("data", {}).get("attributes", {})
+            stats = attributes.get("last_analysis_stats", {})
+            return _result(stats.get("malicious"), stats.get("suspicious"))
+    except Exception:  # pragma: no cover
+        return _result()
diff --git a/src/parsers/ua_analysis.py b/src/parsers/ua_analysis.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import re
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 
 
 SUSPICIOUS_AGENTS = [
@@ -19,12 +19,12 @@
 ]
 
 
-def detect_suspicious_user_agent(ua: str | None) -> Tuple[bool, str | None]:
+def detect_suspicious_user_agent(ua: Optional[str], patterns: Optional[List[str]] = None) -> Tuple[bool, Optional[str]]:
+    """Check a User-Agent string against suspicious regex patterns.
+
+    Args:
+        ua: Raw User-Agent value; ``None`` or empty yields ``(False, None)``.
+        patterns: Regex patterns to try instead of ``SUSPICIOUS_AGENTS``.
+            ``None`` or an empty list falls back to ``SUSPICIOUS_AGENTS``.
+
+    Returns:
+        ``(True, pattern)`` for the first pattern that matches, else
+        ``(False, None)``. The User-Agent is lower-cased before matching and
+        no regex flags are used, so patterns must be written in lower case.
+    """
     if not ua:
         return False, None
     ua_l = ua.lower()
-    for pat in SUSPICIOUS_AGENTS:
+    pats = patterns if patterns else SUSPICIOUS_AGENTS
+    for pat in pats:
         if re.search(pat, ua_l):
             return True, pat
     return False, None
-
diff --git a/src/reports/report_builder.py b/src/reports/report_builder.py
@@ -32,8 +32,8 @@ def build_markdown_report(
     if not suspicious:
         lines.append("No suspicious IPs identified.\n")
     else:
-        lines.append(_md_row(["IP", "Risk", "Abuse Score", "Total Reports", "Country", "Requests", "4xx", "Suspicious UA", "One-line Explain"]))
-        lines.append(_md_row(["---"] * 9))
+        lines.append(_md_row(["IP", "Risk", "Abuse Score", "Total Reports", "Country", "Requests", "4xx", "Suspicious UA", "Talos", "VT (mal/susp)", "One-line Explain"]))
+        lines.append(_md_row(["---"] * 11))
         for s in suspicious:
             lines.append(
                 _md_row([
@@ -45,6 +45,8 @@ def build_markdown_report(
                     str(s.get("requests", "")),
                     str(s.get("errors_4xx", "")),
                     "yes" if s.get("ua_suspicious") else "no",
+                    str(s.get("talos_reputation", "")),
+                    f"{s.get('vt_malicious','')}/{s.get('vt_suspicious','')}",
                     str(s.get("ai_one_liner", "")),
                 ])
             )
@@ -80,11 +82,41 @@ def build_text_report(
             lines.append(
                 f"- {s.get('ip')} | risk={s.get('risk')} | score={s.get('abuse_confidence_score')} | "
                 f"reports={s.get('total_reports')} | country={s.get('country')} | req={s.get('requests')} | "
-                f"4xx={s.get('errors_4xx')} | UA suspicious={'yes' if s.get('ua_suspicious') else 'no'}\n"
+                f"4xx={s.get('errors_4xx')} | UA suspicious={'yes' if s.get('ua_suspicious') else 'no'} | "
+                f"talos={s.get('talos_reputation')} | vt={s.get('vt_malicious')}/{s.get('vt_suspicious')}\n"
             )
             if s.get("ai_one_liner"):
                 lines.append(f"  AI: {s.get('ai_one_liner')}\n")
 
     path.write_text("".join(lines), encoding="utf-8")
     return path
 
+
+def build_malicious_ai_report(
+    out_dir: Path,
+    content: str,
+    *,
+    title: str = "Malicious Activity AI Report",
+) -> tuple[Path, Path]:
+    """Persist the AI-written malicious activity report as text and Markdown.
+
+    Args:
+        out_dir: Directory for the report files; created if it does not exist.
+        content: Report body; surrounding whitespace is stripped before writing.
+        title: Heading used in both files.
+
+    Returns:
+        ``(txt_path, md_path)`` for ``malicious_ai_report.txt`` and
+        ``malicious_ai_report.md`` inside ``out_dir``.
+    """
+    out_dir.mkdir(parents=True, exist_ok=True)
+    txt_path = out_dir / "malicious_ai_report.txt"
+    md_path = out_dir / "malicious_ai_report.md"
+
+    # Both files share the same stripped body terminated by a single newline.
+    body = content.strip() + "\n"
+
+    # Text version: title underlined with "=" to the title's length.
+    underline = "=" * len(title)
+    txt_path.write_text(f"{title}\n{underline}\n\n{body}", encoding="utf-8")
+
+    # Markdown version: title as a level-1 heading.
+    md_path.write_text(f"# {title}\n\n{body}", encoding="utf-8")
+
+    return txt_path, md_path