Skip to content

Commit 034023f

Browse files
authored
Merge pull request #169 from fmacpro/develop
1.2.2 Improved container detection, content sanitization, and readability metrics
2 parents efdc7dd + 85f1031 commit 034023f

File tree

11 files changed

+682
-1167
lines changed

11 files changed

+682
-1167
lines changed

APIDOC.md

Lines changed: 0 additions & 103 deletions
This file was deleted.

README.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -394,8 +394,6 @@ Horseman automatically detects the article language and exposes ISO codes via `a
394394

395395
Please feel free to fork the repo or open pull requests to the development branch. I've used [eslint](https://eslint.org/) for linting.
396396

397-
[Module API Docs](https://github.com/fmacpro/horseman-article-parser/blob/development/APIDOC.md)
398-
399397
Build the dependencies with:
400398

401399
```
@@ -432,8 +430,6 @@ Run quick tests and batches from this repo without writing code.
432430
- `npm run batch:crawl -- --urls-file scripts/data/urls.txt --out-file scripts/data/candidates_with_url.csv --start 0 --limit 200 --concurrency 1 --unique-hosts --progress-only`
433431
- train:ranker: Train reranker weights from a candidates CSV.
434432
- `npm run train:ranker -- <candidatesCsv>`
435-
- docs: Generate API docs to `APIDOC.md`.
436-
- `npm run docs`
437433

438434
### Common arguments
439435

controllers/contentDetector.js

Lines changed: 170 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -296,40 +296,180 @@ function findBestPreferredDescendant(root, options = {}) {
296296
return (valid || heavy) ? best : null
297297
}
298298

299-
// Heuristic: If content paragraphs are split across multiple sibling containers within
300-
// a higher-level container (e.g., ARTICLE), prefer that parent container to avoid fragmenting
301-
// the article body selection.
302-
function findFragmentedAncestor(node, options = {}, document) {
303-
if (!node || !document) return null
299+
/**
 * Decide whether `node` looks like an article body that is split across
 * several of its direct child elements ("fragmented").
 *
 * A child counts as a "part" when it is an element node, its text is at least
 * `fragment.minChildChars` long (default 150) and it contains at least one
 * paragraph. The node qualifies when all of the following hold:
 *   - it has at least `fragment.minParts` parts (default 2);
 *   - the parts' combined text is at least the smaller of the node's own text
 *     length and `fragment.minCombinedChars` (default: max(minLength, 400));
 *   - at least 35% of the parts' text lies outside the single largest part;
 *   - the node's link density does not exceed `fragment.maxLinkDensity`
 *     (default: max(contentDetection.maxLinkDensity, 0.65)).
 *
 * HTML and BODY elements, and nodes without children, never qualify.
 *
 * NOTE(review): getText, paragraphCount and linkDensity are not defined in
 * this block; their exact semantics should be confirmed where they are defined.
 *
 * @param {Element} node - Candidate container to inspect.
 * @param {object} [options] - Parser options; only `options.contentDetection` is read here.
 * @returns {{qualifies: boolean, parts: Array<{child: Element, textLen: number, paras: number}>,
 *   partsText: number, totalText: number, effectiveTotal: number, ratio: number,
 *   linkDensity: number}} Analysis record. Fields are filled in as the checks
 *   progress, so after an early return the later fields keep their initial 0.
 */
function evaluateFragmentation(node, options = {}) {
300+
// Start from a "does not qualify" record; every early return hands this back as-is.
const result = {
301+
qualifies: false,
302+
parts: [],
303+
partsText: 0,
304+
totalText: 0,
305+
effectiveTotal: 0,
306+
ratio: 0,
307+
linkDensity: 0
308+
}
309+
if (!node || !node.children || !node.children.length) return result
310+
311+
const tagName = (node.tagName || '').toUpperCase()
312+
// Document-level containers are never reported as fragmented.
if (!tagName || tagName === 'HTML' || tagName === 'BODY') return result
313+
304314
// `||` is used for these two defaults, so a configured value of 0 also falls back to the default.
const minLen = (options.contentDetection && options.contentDetection.minLength) || 400
305315
const maxLD = (options.contentDetection && options.contentDetection.maxLinkDensity) || 0.5
306316
const fragCfg = (options.contentDetection && options.contentDetection.fragment) || {}
307317
const cfgMinParts = Number.isFinite(fragCfg.minParts) ? fragCfg.minParts : 2
308318
const cfgMinChildChars = Number.isFinite(fragCfg.minChildChars) ? fragCfg.minChildChars : 150
309-
const cfgMinCombinedChars = Number.isFinite(fragCfg.minCombinedChars) ? fragCfg.minCombinedChars : Math.max(minLen, 400)
319+
const cfgMinCombinedChars = Number.isFinite(fragCfg.minCombinedChars)
320+
? fragCfg.minCombinedChars
321+
: Math.max(minLen, 400)
310322
const cfgMaxLD = (fragCfg.maxLinkDensity != null && Number.isFinite(fragCfg.maxLinkDensity))
311323
? fragCfg.maxLinkDensity
312324
: Math.max(maxLD, 0.65)
313-
const CONTAINERS = new Set(['ARTICLE','SECTION','MAIN'])
325+
326+
// Collect the direct element children that carry enough text and at least one paragraph.
for (const child of Array.from(node.children || [])) {
327+
if (!child || child.nodeType !== 1) continue
328+
let textLen = 0
329+
// A failure while reading text is treated as "no text" rather than aborting the scan.
try { textLen = getText(child).length } catch { textLen = 0 }
330+
if (textLen < cfgMinChildChars) continue
331+
const paras = paragraphCount(child)
332+
if (paras < 1) continue
333+
result.parts.push({ child, textLen, paras })
334+
result.partsText += textLen
335+
}
336+
337+
if (result.parts.length < cfgMinParts) return result
338+
339+
let totalText = 0
340+
try { totalText = getText(node).length } catch { totalText = 0 }
341+
result.totalText = totalText
342+
// If the node's own text length is unavailable (0), fall back to the sum of its parts.
const effectiveTotal = totalText > 0 ? totalText : result.partsText
343+
result.effectiveTotal = effectiveTotal
344+
// The parts must cover the node's whole text or cfgMinCombinedChars, whichever is smaller.
if (result.partsText < Math.min(effectiveTotal, cfgMinCombinedChars)) return result
345+
346+
// Sort a copy so the order of result.parts is left untouched.
const sorted = result.parts.slice().sort((a, b) => (b.textLen || 0) - (a.textLen || 0))
347+
const largest = sorted[0] ? sorted[0].textLen : 0
348+
const rest = Math.max(0, result.partsText - largest)
349+
// ratio = share of the parts' text that lies outside the largest part.
const ratio = result.partsText > 0 ? rest / result.partsText : 0
350+
result.ratio = ratio
351+
// Below 0.35 a single child dominates, so the node does not qualify.
if (ratio < 0.35) return result
352+
353+
const ld = linkDensity(node)
354+
result.linkDensity = ld
355+
if (ld > cfgMaxLD) return result
356+
357+
result.qualifies = true
358+
return result
359+
}
360+
361+
// Heuristic: If content paragraphs are split across multiple sibling containers within
362+
// a higher-level container (e.g., ARTICLE), prefer that parent container to avoid fragmenting
363+
// the article body selection.
364+
/**
 * Look upward from `node` for a container whose children split the article
 * body between them, as judged by evaluateFragmentation.
 *
 * Two passes are made:
 *   1. Walk up at most 12 parent elements, regardless of tag, and return the
 *      first parent that qualifies and whose qualifying parts contain `node`.
 *   2. Walk up again from `node` (no depth limit), this time testing only
 *      elements whose tagName is ARTICLE, SECTION or MAIN, with the same
 *      "qualifies and contains the origin" condition.
 *
 * @param {Element} node - Starting element (the current selection).
 * @param {object} [options] - Parser options, forwarded to evaluateFragmentation.
 * @param {Document} document - Only checked for presence in this block.
 * @returns {Element|null} The fragmented ancestor/container, or null when none is found.
 */
function findFragmentedAncestor(node, options = {}, document) {
365+
if (!node || !document) return null
366+
const origin = node
367+
const maxDepth = 12
314368
let cur = node
369+
// Pass 1: bounded climb over generic parents.
for (let depth = 0; cur && cur.parentElement && depth < maxDepth; depth++) {
370+
const parent = cur.parentElement
371+
if (!parent) break
372+
const analysis = evaluateFragmentation(parent, options)
373+
if (analysis.qualifies) {
374+
// Only accept the parent if the starting node sits inside one of the qualifying parts.
const includesOrigin = analysis.parts.some(part => {
375+
try { return part.child && part.child.contains(origin) } catch { return false }
376+
})
377+
if (includesOrigin) return parent
378+
}
379+
cur = parent
380+
}
381+
382+
// Pass 2: restart from the original node and test only semantic container tags.
const CONTAINERS = new Set(['ARTICLE','SECTION','MAIN'])
383+
cur = node
315384
while (cur && cur.parentElement) {
316385
if (CONTAINERS.has(cur.tagName)) {
317-
const children = Array.from(cur.children || [])
318-
const parts = children.filter(c => {
319-
try { return paragraphCount(c) >= 1 && getText(c).length >= cfgMinChildChars } catch { return false }
320-
})
321-
const totalText = getText(cur).length
322-
const partsText = parts.reduce((acc, c) => acc + getText(c).length, 0)
323-
const ld = linkDensity(cur)
324-
if (parts.length >= cfgMinParts && partsText >= Math.min(totalText, cfgMinCombinedChars) && ld <= cfgMaxLD) {
325-
return cur
386+
const analysis = evaluateFragmentation(cur, options)
387+
if (analysis.qualifies) {
388+
const includesOrigin = analysis.parts.some(part => {
389+
try { return part.child && part.child.contains(origin) } catch { return false }
390+
})
391+
if (includesOrigin) return cur
326392
}
327393
}
328394
cur = cur.parentElement
329395
}
330396
return null
331397
}
332398

399+
/**
 * Boolean convenience wrapper around evaluateFragmentation.
 *
 * @param {Element} node - Element to test.
 * @param {object} [options] - Parser options, forwarded to evaluateFragmentation.
 * @returns {boolean} The `qualifies` flag of the analysis; false when `node`
 *   is missing or has no `children` property.
 */
function isFragmentedNode(node, options = {}) {
400+
if (!node || !node.children) return false
401+
const analysis = evaluateFragmentation(node, options)
402+
return analysis.qualifies
403+
}
404+
405+
function preferDirectParagraphContainer(node, options = {}) {
406+
if (!node || !node.children) return node
407+
const maxLD = (options.contentDetection && options.contentDetection.maxLinkDensity) || 0.5
408+
const fragCfg = (options.contentDetection && options.contentDetection.fragment) || {}
409+
const cfgMinChildChars = Number.isFinite(fragCfg.minChildChars) ? fragCfg.minChildChars : 150
410+
const visited = new Set()
411+
let current = node
412+
let fallback = containsSemantic(node) ? node : null
413+
const maxSteps = 8
414+
415+
for (let depth = 0; depth < maxSteps; depth++) {
416+
if (!current || visited.has(current)) break
417+
visited.add(current)
418+
419+
if (containsSemantic(current)) fallback = current
420+
421+
const directP = countDirect(current, 'p')
422+
if (directP >= 1) return current
423+
424+
if (!current.children || !current.children.length) break
425+
426+
if (isFragmentedNode(current, options)) break
427+
428+
const children = Array.from(current.children).filter(c => c && c.nodeType === 1)
429+
if (!children.length) break
430+
431+
let totalText = 0
432+
try { totalText = getText(current).length } catch { totalText = 0 }
433+
const totalParas = paragraphCount(current)
434+
435+
const candidates = []
436+
for (const child of children) {
437+
let textLen = 0
438+
try { textLen = getText(child).length } catch { textLen = 0 }
439+
if (textLen < cfgMinChildChars) continue
440+
const paras = paragraphCount(child)
441+
if (paras < 1) continue
442+
const ld = linkDensity(child)
443+
if (ld > Math.max(maxLD, 0.7)) continue
444+
candidates.push({ child, textLen, paras, ld, directP: countDirect(child, 'p') })
445+
}
446+
447+
if (!candidates.length) break
448+
449+
candidates.sort((a, b) => (b.textLen || 0) - (a.textLen || 0))
450+
const best = candidates[0]
451+
const bestLenRatio = totalText > 0 ? best.textLen / totalText : 1
452+
const bestParaRatio = totalParas > 0 ? best.paras / totalParas : (best.paras > 0 ? 1 : 0)
453+
454+
if (bestLenRatio < 0.45 && bestParaRatio < 0.7 && best.directP === 0) break
455+
456+
const second = candidates[1]
457+
if (second) {
458+
const secondLenRatio = totalText > 0 ? second.textLen / totalText : 0
459+
if (secondLenRatio >= 0.35) break
460+
}
461+
462+
if (isFragmentedNode(best.child, options)) break
463+
464+
if (containsSemantic(best.child)) fallback = best.child
465+
466+
current = best.child
467+
}
468+
469+
if (countDirect(current, 'p') >= 1) return current
470+
return fallback || current
471+
}
472+
333473
function getXPath(node) {
334474
try {
335475
if (!node || !node.ownerDocument) return ''
@@ -675,6 +815,20 @@ export function detectContent(document, options = {}, seeds = {}) {
675815
}
676816
}
677817

818+
// Prefer a direct paragraph container when available, while preserving fragmented articles.
819+
try {
820+
if (selected && selected.el) {
821+
const refinedDirect = preferDirectParagraphContainer(selected.el, options)
822+
if (refinedDirect && refinedDirect !== selected.el) {
823+
const cleanDirect = stripBadContainers(refinedDirect)
824+
if (cleanDirect && cleanDirect.innerHTML && cleanDirect.innerHTML.trim().length > 0) {
825+
html = cleanDirect.innerHTML
826+
selected = { el: refinedDirect }
827+
}
828+
}
829+
}
830+
} catch { /* ignore */ }
831+
678832
// Descendant promotion: if selection is BODY, but BODY contains a strong
679833
// preferred content descendant, promote to that descendant.
680834
try {

0 commit comments

Comments
 (0)