Skip to content

Commit 0df23bf

Browse files
committed
Fix dnstwist script
1 parent 5468e42 commit 0df23bf

File tree

12 files changed

+533
-126
lines changed

12 files changed

+533
-126
lines changed

.dnstwist-cache/seen.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

.github/scripts/build_findings.js

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
#!/usr/bin/env node
2-
// Build findings: merge Phase A/B + HTTP diagnostics + RDAP, and output only
3-
// domains that actually load (if REPORT_ONLY_LIVE=1).
4-
// LIVE means: any host {domain, www.domain} at / or probed path returns 200 with non-tiny body.
5-
// Adds reasons: IMG, HTML, NAME, KEYWORD, has-MX, age≤Nd, LIVE, PATH, parked-ns.
2+
// Build findings: merge Phase A/B + HTTP diagnostics + RDAP + (optional) snapshot similarity.
3+
// Outputs only domains that meet score threshold (and LIVE-only if REPORT_ONLY_LIVE=1).
64

75
import fs from "node:fs";
86
import { promises as fsp } from "node:fs";
97

8+
// ---- knobs (via env) ----
109
const lshThr = parseInt(process.env.LSH_THRESHOLD || "80", 10);
1110
const phashThr = parseInt(process.env.PHASH_THRESHOLD|| "90", 10);
1211
const minScore = parseInt(process.env.MIN_SCORE || "3", 10);
@@ -15,11 +14,12 @@ const NEW_DOMAIN_WINDOW_DAYS = parseInt(process.env.NEW_DOMAIN_WINDOW_DAYS || "1
1514
const REPORT_ONLY_LIVE = (process.env.REPORT_ONLY_LIVE || "0") === "1";
1615
const BODY_MIN = parseInt(process.env.BODY_MIN || "1500", 10); // "has content"
1716

17+
// secrets/inputs
1818
const risky = (process.env.RISKY_KEYWORDS || "").split(",").map(w=>w.trim().toLowerCase()).filter(Boolean);
1919
const allow = (process.env.ALLOWLIST || "").split(",").map(w=>w.trim().toLowerCase()).filter(Boolean);
20-
const seeds = (process.env.WATCH_DOMAINS || "relay.link").split(",").map(s=>s.trim().toLowerCase()).filter(Boolean);
21-
const seedLabels = seeds.map(s => s.split(".")[0]);
20+
const seeds = (process.env.WATCH_DOMAINS || "").split(",").map(s=>s.trim().toLowerCase()).filter(Boolean);
2221

22+
// ---- helpers ----
2323
function indexBy(rows){
2424
const m=new Map();
2525
for (const r of rows||[]) {
@@ -40,16 +40,25 @@ function lev(a,b){
4040
}
4141
return M[a.length][b.length];
4242
}
43+
const seedLabels = seeds.map(sld);
44+
const allowSet = new Set([...allow, ...seeds]); // seeds implicitly allowed
45+
/**
 * Report whether a domain is covered by the allow set.
 *
 * A domain matches when, after lower-casing, it equals an entry of `allowSet`
 * or is a subdomain of one (ends with "." + entry). Nullish/empty input is
 * treated as the empty string.
 *
 * @param {string} dom - Domain name to test.
 * @returns {boolean} true when the domain is allow-listed.
 */
const isAllowed = (dom) => {
  const host = (dom || "").toLowerCase();
  return [...allowSet].some((base) => host === base || host.endsWith("." + base));
};
4352
const isRiskyName = (dom) => { const s=(dom||"").toLowerCase(); return risky.some(k=>s.includes(k)); };
44-
const isAllowed = (dom) => { const s=(dom||"").toLowerCase(); return allow.some(k => s===k || s.endsWith("."+k)); };
4553
const nameClose = (dom) => seedLabels.some(L => lev(sld(dom), L) <= NAME_EDIT_MAX);
4654

55+
// ---- HTTP diagnostics helpers ----
4756
function diagChecks(diag) {
4857
const checks = [];
4958
if (!diag) return checks;
5059
if (Array.isArray(diag.results) && diag.results.length) {
5160
for (const r of diag.results) {
52-
if (r.http) checks.push({ kind:"root-http", ...r.http });
61+
if (r.http) checks.push({ kind:"root-http", ...r.http });
5362
if (r.https) checks.push({ kind:"root-https", ...r.https });
5463
for (const p of (r.probes||[])) checks.push({ kind:`probe${p.path||""}`, ...p });
5564
}
@@ -64,19 +73,20 @@ function diagHasLive(diag) {
6473
return diagChecks(diag).some(x => x && x.status === 200 && (x.length||0) >= BODY_MIN);
6574
}
6675
/**
 * Report whether any probed-path check in the diagnostics looks live.
 *
 * A check qualifies when its status is 200, its `length` (0 when missing) is
 * at least BODY_MIN, and its `kind` string starts with "probe".
 *
 * @param {object} diag - HTTP diagnostics record, as accepted by diagChecks().
 * @returns {boolean} true when at least one qualifying probe check exists.
 */
function diagHasLivePath(diag) {
  for (const check of diagChecks(diag)) {
    if (!check) continue;
    if (check.status !== 200) continue;
    // Negated ">=" (rather than "<") keeps the same result if BODY_MIN is NaN.
    if (!((check.length || 0) >= BODY_MIN)) continue;
    if (String(check.kind || "").startsWith("probe")) return true;
  }
  return false;
}
7280

81+
// ---- main ----
7382
(async () => {
7483
const A = fs.existsSync("phaseA_results.json") ? JSON.parse(await fsp.readFile("phaseA_results.json","utf8")) : [];
7584
const B = fs.existsSync("phaseB_results.json") ? JSON.parse(await fsp.readFile("phaseB_results.json","utf8")) : [];
7685
const H = fs.existsSync("http_diagnostics.json") ? JSON.parse(await fsp.readFile("http_diagnostics.json","utf8")) : [];
7786
const R = fs.existsSync("rdap_enrich.json") ? JSON.parse(await fsp.readFile("rdap_enrich.json","utf8")) : [];
87+
const S = fs.existsSync("snap_similarity.json") ? JSON.parse(await fsp.readFile("snap_similarity.json","utf8")) : {};
7888

79-
console.log(`Inputs: PhaseA=${A.length} PhaseB=${B.length} HTTPdiag=${H.length} RDAP=${R.length} REPORT_ONLY_LIVE=${REPORT_ONLY_LIVE}`);
89+
console.log(`Inputs: PhaseA=${A.length} PhaseB=${B.length} HTTPdiag=${H.length} RDAP=${R.length} SnapSim=${Object.keys(S||{}).length} REPORT_ONLY_LIVE=${REPORT_ONLY_LIVE}`);
8090

8191
const mapAB = indexBy(A);
8292
for (const [k,v] of indexBy(B).entries()) mapAB.set(k, {...(mapAB.get(k)||{}), ...v});
@@ -91,16 +101,21 @@ function diagHasLivePath(diag) {
91101
const hasA = Array.isArray(row.dns_a) ? row.dns_a.length>0 : Boolean(row.dns_a);
92102
const hasMX = Array.isArray(row.mx) ? row.mx.length>0 : Boolean(row.mx);
93103
const html = row.http_similarity;
94-
// different versions may store image similarity under different keys
95-
const img = row.screenshot_similarity ?? row.phash_similarity ?? row.page_similarity ?? row.image_similarity ?? null;
96104

97-
// RDAP merge
105+
// image similarity from any source (DNSTwist, pHash, or our PNG comparison)
106+
let img = row.screenshot_similarity ?? row.phash_similarity ?? row.page_similarity ?? row.image_similarity ?? null;
107+
const snapSim = (S && typeof S[dom]?.image_similarity === "number") ? S[dom].image_similarity : null;
108+
if (typeof snapSim === "number") {
109+
img = (img == null) ? snapSim : Math.max(img, snapSim);
110+
}
111+
112+
// RDAP merge (if present)
98113
const rd = rdapByDomain.get(dom) || {};
99114
const registrar = rd.registrar ?? null;
100115
const abuse_emails = Array.isArray(rd.abuse_emails) ? rd.abuse_emails : [];
101116
const nameservers = Array.isArray(rd.nameservers) ? rd.nameservers : [];
102-
const age_days = (typeof rd.age_days === "number") ? rd.age_days
103-
: (typeof row.age_days === "number" ? row.age_days : null);
117+
const rd_age = (typeof rd.age_days === "number") ? rd.age_days : null;
118+
const age_days = (rd_age != null) ? rd_age : (typeof row.age_days === "number" ? row.age_days : null);
104119
const parkedNS = !!rd.parked_ns;
105120

106121
// LIVE gates
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
#!/usr/bin/env node
2+
// Compare snaps/*.png to a reference app screenshot.
3+
// Outputs snap_similarity.json mapping domain -> {image_similarity, method, error?}
4+
5+
import fs from "node:fs";
6+
import { promises as fsp } from "node:fs";
7+
import os from "node:os";
8+
import path from "node:path";
9+
import { spawnSync } from "node:child_process";
10+
import crypto from "node:crypto";
11+
12+
// ---- Chrome resolver (reuse from snapshot script) ----
13+
/**
 * Locate a Chromium-family browser executable.
 *
 * Resolution order:
 *   1. The CHROME_BIN environment variable. A leading "~" or "~/" is expanded
 *      to the current user's home directory; a path ending in ".app" is
 *      searched for a known inner binary before the path itself is tried.
 *   2. A fixed list of macOS application paths.
 *   3. A fixed list of binary names resolved with `command -v` in a bash
 *      login shell.
 *
 * @returns {string} Path of the first candidate that exists and is a regular
 *   file, or "" when nothing is found.
 */
function resolveChromeBin() {
  // True only for an existing regular file; filesystem errors count as "no".
  const isFile = (candidate) => {
    try {
      return Boolean(candidate) && fs.existsSync(candidate) && fs.statSync(candidate).isFile();
    } catch {
      return false;
    }
  };

  // Only "~" and "~/..." mean the current user's home. Anything else starting
  // with "~" (e.g. "~other/bin") is left untouched instead of being rewritten
  // to a path under our own home directory.
  const expandHome = (p) => {
    if (p === "~") return os.homedir();
    if (p.startsWith("~/")) return path.join(os.homedir(), p.slice(2));
    return p;
  };

  const envVal = (process.env.CHROME_BIN || "").trim();
  if (envVal) {
    const fromEnv = expandHome(envVal);
    if (fromEnv.endsWith(".app")) {
      const bundled = [
        "Contents/MacOS/Google Chrome",
        "Contents/MacOS/Chromium",
        "Contents/MacOS/Brave Browser",
        "Contents/MacOS/Microsoft Edge",
      ].map((rel) => path.join(fromEnv, rel));
      const hit = bundled.find(isFile);
      if (hit) return hit;
    }
    if (isFile(fromEnv)) return fromEnv;
  }

  const macApps = [
    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    "/Applications/Chromium.app/Contents/MacOS/Chromium",
    "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
    "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
  ];
  const macHit = macApps.find(isFile);
  if (macHit) return macHit;

  const binaryNames = ["google-chrome-stable","google-chrome","chromium","chromium-browser","brave-browser","brave","microsoft-edge"];
  for (const name of binaryNames) {
    try {
      const out = spawnSync("bash", ["-lc", `command -v ${name}`], { encoding:"utf8" });
      // stdout is null when bash itself could not be spawned.
      const found = (out.stdout || "").trim();
      if (isFile(found)) return found;
    } catch {
      // Best-effort lookup: move on to the next candidate name.
    }
  }
  return "";
}
46+
47+
function takeRefShot(chrome, url, outPng) {
48+
fs.mkdirSync(path.dirname(outPng), { recursive: true });
49+
const args = [
50+
"--headless=new","--no-sandbox","--disable-gpu","--disable-dev-shm-usage",
51+
"--hide-scrollbars","--no-first-run","--no-default-browser-check","--no-zygote",
52+
"--window-size=1366,768", `--screenshot=${outPng}`, url
53+
];
54+
const env = { ...process.env, DBUS_SESSION_BUS_ADDRESS:"/dev/null", XDG_RUNTIME_DIR:"/tmp", LIBGL_ALWAYS_SOFTWARE:"1" };
55+
const proc = spawnSync(chrome, args, { encoding:"utf8", env, timeout: 20000 });
56+
if (proc.status !== 0 || !fs.existsSync(outPng)) {
57+
throw new Error(`reference shot failed (status ${proc.status}): ${proc.stderr || proc.stdout || ""}`);
58+
}
59+
}
60+
61+
// ---- tiny PNG reader + dHash/SSIM (no native deps) ----
62+
// We’ll use pure-js PNG decode to avoid npm native installs in Actions.
63+
import { PNG } from "pngjs"; // add to package.json devDeps
64+
/**
 * Decode a PNG file.
 *
 * @param {string} p - Path of the PNG file to read.
 * @returns {Promise<{width:number,height:number,data:Buffer}>} Resolves with
 *   the decoder's width, height and a copy of its pixel buffer. Rejects when
 *   either reading the file or decoding it fails.
 */
function readPNG(p) {
  return new Promise((resolve, reject) => {
    const source = fs.createReadStream(p);
    const decoder = new PNG();

    // pipe() does not forward errors from the source stream, so a failed read
    // must be handled here or it surfaces as an unhandled "error" event.
    source.on("error", reject);
    decoder.on("error", (err) => {
      source.destroy();
      reject(err);
    });
    decoder.on("parsed", () => {
      resolve({ width: decoder.width, height: decoder.height, data: Buffer.from(decoder.data) });
    });

    source.pipe(decoder);
  });
}
71+
/**
 * Convert a 4-bytes-per-pixel image to 8-bit grayscale.
 *
 * Each output pixel is 0.299*R + 0.587*G + 0.114*B truncated to an integer;
 * the fourth byte of every pixel is ignored.
 *
 * @param {{width:number,height:number,data:ArrayLike<number>}} img - Source image.
 * @returns {{width:number,height:number,data:Uint8Array}} Grayscale image with
 *   one byte per pixel in row-major order.
 */
function toGray(img) {
  const { width, height, data } = img;
  const pixelCount = width * height;
  const gray = new Uint8Array(pixelCount);
  for (let px = 0; px < pixelCount; px += 1) {
    const offset = px * 4;
    const red = data[offset];
    const green = data[offset + 1];
    const blue = data[offset + 2];
    gray[px] = (0.299*red + 0.587*green + 0.114*blue) | 0;
  }
  return { width, height, data: gray };
}
80+
/**
 * Resize a grayscale image with nearest-neighbour sampling.
 *
 * Destination coordinates are mapped so that the first and last rows/columns
 * of the destination line up with the first and last of the source. A
 * destination dimension of 1 samples source index 0 on that axis.
 *
 * @param {{width:number,height:number,data:ArrayLike<number>}} img - Source
 *   image, one byte per pixel, row-major.
 * @param {number} w2 - Destination width.
 * @param {number} h2 - Destination height.
 * @returns {{width:number,height:number,data:Uint8Array}} The resized image.
 */
function resizeGray(img, w2, h2) {
  const { width: w1, height: h1, data: d1 } = img;
  const d2 = new Uint8Array(w2 * h2);

  // Source index for destination index `i` along one axis. With a single
  // destination sample the scale would be 0/0 (NaN), so pin it to index 0.
  const sourceIndex = (i, srcLen, dstLen) =>
    dstLen > 1 ? Math.min(srcLen - 1, Math.round(i * (srcLen - 1) / (dstLen - 1))) : 0;

  // Column mapping is the same for every row, so compute it once.
  const columns = [];
  for (let x = 0; x < w2; x++) columns.push(sourceIndex(x, w1, w2));

  for (let y = 0; y < h2; y++) {
    const rowStart = sourceIndex(y, h1, h2) * w1;
    for (let x = 0; x < w2; x++) {
      d2[y * w2 + x] = d1[rowStart + columns[x]];
    }
  }
  return { width: w2, height: h2, data: d2 };
}
93+
/**
 * Compute a 64-bit difference hash of a grayscale image.
 *
 * The image is resized to 9x8; for each of the 8 rows, each of the 8
 * horizontally adjacent pixel pairs contributes one bit (1 when the left
 * pixel is brighter than the right). Bits are appended most-significant
 * first, row by row.
 *
 * @param {{width:number,height:number,data:ArrayLike<number>}} gray - Grayscale image.
 * @returns {bigint} The hash as a non-negative BigInt below 2^64.
 */
function dHash64(gray) {
  const cols = 9;
  const rows = 8;
  const pixels = resizeGray(gray, cols, rows).data;
  let hash = 0n;
  for (let row = 0; row < rows; row += 1) {
    const base = row * cols;
    for (let col = 0; col < 8; col += 1) {
      const left = pixels[base + col];
      const right = pixels[base + col + 1];
      const bit = left > right ? 1n : 0n;
      hash = (hash << 1n) | bit;
    }
  }
  return hash;
}
106+
/**
 * Hamming distance between two 64-bit hashes.
 *
 * @param {bigint} a - First hash.
 * @param {bigint} b - Second hash.
 * @returns {number} Number of differing bits among the low 64 bits (0..64).
 */
function hamming64(a, b) {
  // Mask to 64 bits: a negative XOR has infinitely many set bits in two's
  // complement, so clearing them one at a time would never reach zero.
  let diff = BigInt.asUintN(64, a ^ b);
  let distance = 0;
  while (diff !== 0n) {
    diff &= diff - 1n; // clear the lowest set bit
    distance += 1;
  }
  return distance;
}
112+
// simple SSIM over resized 256x256 (fast-ish)
113+
/**
 * Structural-similarity score between two grayscale images.
 *
 * Both images are resized to 256x256 and a single SSIM value is computed from
 * the means, variances and covariance taken over the whole resized image
 * (one global window), using K1=0.01, K2=0.03 and a dynamic range of 255.
 *
 * @param {{width:number,height:number,data:ArrayLike<number>}} grayA - First image.
 * @param {{width:number,height:number,data:ArrayLike<number>}} grayB - Second image.
 * @returns {number} Score clamped to [0, 1]; a non-finite result becomes 0.
 */
function ssimFast(grayA, grayB) {
  const pixelsA = resizeGray(grayA, 256, 256).data;
  const pixelsB = resizeGray(grayB, 256, 256).data;

  const K1 = 0.01;
  const K2 = 0.03;
  const L = 255;
  const C1 = (K1*L)**2;
  const C2 = (K2*L)**2;

  const n = pixelsA.length;

  // First pass: means.
  let meanA = 0;
  let meanB = 0;
  for (let i = 0; i < n; i += 1) {
    meanA += pixelsA[i];
    meanB += pixelsB[i];
  }
  meanA /= n;
  meanB /= n;

  // Second pass: sample variances and covariance.
  let varA = 0;
  let varB = 0;
  let covAB = 0;
  for (let i = 0; i < n; i += 1) {
    const devA = pixelsA[i] - meanA;
    const devB = pixelsB[i] - meanB;
    varA += devA*devA;
    varB += devB*devB;
    covAB += devA*devB;
  }
  varA /= n-1;
  varB /= n-1;
  covAB /= n-1;

  const numerator = (2*meanA*meanB + C1) * (2*covAB + C2);
  const denominator = (meanA*meanA + meanB*meanB + C1) * (varA + varB + C2);
  const score = numerator/denominator;
  if (!Number.isFinite(score)) return 0;
  return Math.max(0, Math.min(1, score));
}
133+
134+
/**
 * Return the path of the reference screenshot, capturing it if missing.
 *
 * The path comes from REFERENCE_PNG (default "reference_snaps/relay.link.png").
 * When that file does not exist, a browser is located and a screenshot of
 * REFERENCE_URL (default "https://relay.link/") is written there.
 *
 * @returns {Promise<string>} Path of the reference PNG.
 * @throws {Error} When no browser binary is found, or the capture fails.
 */
async function ensureReferencePng() {
  const target = process.env.REFERENCE_PNG || "reference_snaps/relay.link.png";
  if (!fs.existsSync(target)) {
    const pageUrl = process.env.REFERENCE_URL || "https://relay.link/";
    const browser = resolveChromeBin();
    if (!browser) {
      throw new Error("No Chrome binary found for reference shot.");
    }
    fs.mkdirSync(path.dirname(target), { recursive:true });
    takeRefShot(browser, pageUrl, target);
  }
  return target;
}
145+
146+
/**
 * Score every PNG in snaps/ against the reference screenshot and write the
 * results to snap_similarity.json.
 *
 * Each entry is keyed by the file name without its ".png" extension and holds
 * `image_similarity` (0..100, rounded from 0.6*SSIM + 0.4*dHash similarity)
 * and `method`. A file that cannot be read or scored gets
 * `image_similarity: null`, `method: "error"` and the error text.
 * When snaps/ does not exist an empty object is written.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const refPng = await ensureReferencePng();
  const refImg = toGray(await readPNG(refPng));
  const refHash = dHash64(refImg);

  const snapsDir = "snaps";
  if (!fs.existsSync(snapsDir)) { await fsp.writeFile("snap_similarity.json","{}"); console.log("No snaps/ folder."); return; }

  const pngExt = ".png";
  const files = fs.readdirSync(snapsDir).filter(f => f.toLowerCase().endsWith(pngExt));
  const out = {};

  for (const f of files) {
    // The filter above is case-insensitive, so strip the extension by length;
    // a case-sensitive basename(f, ".png") would leave "X.PNG" as the key.
    const domain = f.slice(0, f.length - pngExt.length);
    const p = path.join(snapsDir, f);
    try {
      const img = toGray(await readPNG(p));
      const hash = dHash64(img);
      const dist = hamming64(refHash, hash);       // 0..64
      const dhashSim = 1 - dist/64;                // 0..1
      const ssim = ssimFast(refImg, img);          // 0..1
      const combined = Math.round((0.6*ssim + 0.4*dhashSim) * 100); // weighted

      out[domain] = { image_similarity: combined, method: "dhash+ssim" };
    } catch (e) {
      // One unreadable snapshot must not abort the run; record it instead.
      out[domain] = { image_similarity: null, method: "error", error: String(e) };
    }
  }

  await fsp.writeFile("snap_similarity.json", JSON.stringify(out, null, 2));
  console.log(`Wrote snap_similarity.json for ${Object.keys(out).length} domains`);
}
177+
178+
main().catch(e => { console.error(e); process.exit(1); });

0 commit comments

Comments
 (0)