@@ -296,40 +296,180 @@ function findBestPreferredDescendant(root, options = {}) {
296296 return ( valid || heavy ) ? best : null
297297}
298298
299- // Heuristic: If content paragraphs are split across multiple sibling containers within
300- // a higher-level container (e.g., ARTICLE), prefer that parent container to avoid fragmenting
301- // the article body selection.
302- function findFragmentedAncestor ( node , options = { } , document ) {
303- if ( ! node || ! document ) return null
// Measures whether the text of `node` is spread across several sizeable child
// elements ("parts") instead of being concentrated in a single child.
//
// A child counts as a part when its text (via getText) is at least
// `fragment.minChildChars` long and paragraphCount reports at least one paragraph.
//
// Options (all read from `options.contentDetection`):
//   minLength               - base length; default 400 (a falsy value also yields 400)
//   maxLinkDensity          - base link density; default 0.5 (falsy also yields 0.5)
//   fragment.minParts       - minimum number of parts; default 2
//   fragment.minChildChars  - minimum text length of a part; default 150
//   fragment.minCombinedChars - minimum summed part text; default max(minLength, 400)
//   fragment.maxLinkDensity - link density ceiling for `node`; default max(maxLinkDensity, 0.65)
//
// Returns a fresh object on every call:
//   qualifies      - true only when every check below passed
//   parts          - [{ child, textLen, paras }] in document order
//   partsText      - summed textLen of the parts
//   totalText      - text length of `node` itself (0 until the part-count check passes)
//   effectiveTotal - totalText, or partsText when totalText is 0
//   ratio          - share of partsText that lies outside the largest part
//   linkDensity    - linkDensity(node); only computed once all earlier checks passed
// Fields keep their zero defaults for every check that was not reached.
function evaluateFragmentation ( node , options = { } ) {
  const result = {
    qualifies : false ,
    parts : [ ] ,
    partsText : 0 ,
    totalText : 0 ,
    effectiveTotal : 0 ,
    ratio : 0 ,
    linkDensity : 0
  }
  if ( ! node || ! node . children || ! node . children . length ) return result

  // The document root and BODY are never reported as fragmented containers.
  const tagName = ( node . tagName || '' ) . toUpperCase ( )
  if ( ! tagName || tagName === 'HTML' || tagName === 'BODY' ) return result

  // At least this share of the parts' text must sit outside the largest part;
  // otherwise one child dominates and the node is not considered fragmented.
  const MIN_REST_RATIO = 0.35

  const detection = options . contentDetection || { }
  const minLen = detection . minLength || 400
  const maxLD = detection . maxLinkDensity || 0.5
  const fragCfg = detection . fragment || { }
  const cfgMinParts = Number . isFinite ( fragCfg . minParts ) ? fragCfg . minParts : 2
  const cfgMinChildChars = Number . isFinite ( fragCfg . minChildChars ) ? fragCfg . minChildChars : 150
  const cfgMinCombinedChars = Number . isFinite ( fragCfg . minCombinedChars )
    ? fragCfg . minCombinedChars
    : Math . max ( minLen , 400 )
  const cfgMaxLD = ( fragCfg . maxLinkDensity != null && Number . isFinite ( fragCfg . maxLinkDensity ) )
    ? fragCfg . maxLinkDensity
    : Math . max ( maxLD , 0.65 )

  // Text extraction is best-effort: a failure counts as "no text".
  const safeTextLength = ( el ) => {
    try { return getText ( el ) . length } catch { return 0 }
  }

  for ( const child of Array . from ( node . children ) ) {
    if ( ! child || child . nodeType !== 1 ) continue
    const textLen = safeTextLength ( child )
    if ( textLen < cfgMinChildChars ) continue
    const paras = paragraphCount ( child )
    if ( paras < 1 ) continue
    result . parts . push ( { child , textLen , paras } )
    result . partsText += textLen
  }

  if ( result . parts . length < cfgMinParts ) return result

  const totalText = safeTextLength ( node )
  result . totalText = totalText
  const effectiveTotal = totalText > 0 ? totalText : result . partsText
  result . effectiveTotal = effectiveTotal
  if ( result . partsText < Math . min ( effectiveTotal , cfgMinCombinedChars ) ) return result

  // Only the largest part is needed, so scan for the maximum instead of sorting a copy.
  const largest = result . parts . reduce ( ( max , part ) => Math . max ( max , part . textLen || 0 ) , 0 )
  const rest = Math . max ( 0 , result . partsText - largest )
  const ratio = result . partsText > 0 ? rest / result . partsText : 0
  result . ratio = ratio
  if ( ratio < MIN_REST_RATIO ) return result

  const ld = linkDensity ( node )
  result . linkDensity = ld
  if ( ld > cfgMaxLD ) return result

  result . qualifies = true
  return result
}
360+
// Heuristic: If content paragraphs are split across multiple sibling containers within
// a higher-level container (e.g., ARTICLE), prefer that parent container to avoid fragmenting
// the article body selection.
//
// Walks upward from `node` and returns the nearest ancestor for which
// evaluateFragmentation() qualifies AND one of whose qualifying parts contains `node`.
// Returns null when `node` or `document` is missing or no ancestor matches.
//
// The walk has two phases:
//   1. The closest 12 ancestors are examined regardless of tag name.
//   2. Ancestors above those are examined only when they are ARTICLE, SECTION or MAIN
//      and have a parent element themselves.
//
// `document` is only used as a presence guard; it is not otherwise read here.
//
// NOTE(review): `node` itself is never returned, even when it is a fragmented
// ARTICLE/SECTION/MAIN, because none of its own children can contain it. This
// matches the behavior of the loops this replaces; confirm that callers do not
// rely on getting the origin back.
function findFragmentedAncestor ( node , options = { } , document ) {
  if ( ! node || ! document ) return null

  const origin = node
  const MAX_GENERIC_DEPTH = 12
  const CONTAINERS = new Set ( [ 'ARTICLE' , 'SECTION' , 'MAIN' ] )

  // True when `candidate` is fragmented and the origin lives inside one of its parts
  // (as opposed to a small sibling that was not counted as a part).
  const originSitsInQualifyingPart = ( candidate ) => {
    const analysis = evaluateFragmentation ( candidate , options )
    if ( ! analysis . qualifies ) return false
    return analysis . parts . some ( part => {
      try { return Boolean ( part . child && part . child . contains ( origin ) ) } catch { return false }
    } )
  }

  // Phase 1: nearest ancestors, any tag.
  let cur = node
  for ( let depth = 0 ; depth < MAX_GENERIC_DEPTH && cur . parentElement ; depth ++ ) {
    const parent = cur . parentElement
    if ( originSitsInQualifyingPart ( parent ) ) return parent
    cur = parent
  }

  // Phase 2: continue above the last ancestor examined in phase 1. Everything
  // at or below `cur` has already been rejected with the same test, so
  // re-evaluating it could not change the outcome.
  cur = cur . parentElement
  while ( cur && cur . parentElement ) {
    if ( CONTAINERS . has ( cur . tagName ) && originSitsInQualifyingPart ( cur ) ) return cur
    cur = cur . parentElement
  }
  return null
}
332398
// Boolean shortcut around evaluateFragmentation(): reports only whether `node`
// qualifies as a fragmented container. Nodes without a `children` collection
// (and null/undefined) are never fragmented.
function isFragmentedNode ( node , options = { } ) {
  const hasChildCollection = Boolean ( node && node . children )
  if ( ! hasChildCollection ) return false
  const { qualifies } = evaluateFragmentation ( node , options )
  return qualifies
}
404+
// Narrows a selected container down towards the element that directly holds the
// <p> children, without descending into (or out of) a fragmented article.
//
// Starting at `node`, repeatedly steps into the child with the most text, for at
// most 8 levels. The walk returns as soon as the current element has at least one
// direct `p` child (per countDirect). It stops early when:
//   - the current element has no element children,
//   - the current element, or the child about to be entered, is fragmented
//     (isFragmentedNode),
//   - no child has enough text (`fragment.minChildChars`, default 150), at least one
//     paragraph, and a link density of at most max(maxLinkDensity, 0.7),
//   - the best child holds under 45% of the text and under 70% of the paragraphs
//     and has no direct `p` child,
//   - a second child holds 35% or more of the text (content is split, do not pick one).
//
// Returns the element reached if it has a direct `p` child; otherwise the deepest
// visited element for which containsSemantic() was truthy; otherwise the element
// where the walk stopped. A `node` that is falsy or lacks `children` is returned as is.
function preferDirectParagraphContainer ( node , options = { } ) {
  if ( ! node || ! node . children ) return node

  // The step cap alone bounds the walk: every step moves to a child, so no element
  // can be visited twice.
  const MAX_STEPS = 8
  const detection = options . contentDetection || { }
  const maxLD = detection . maxLinkDensity || 0.5
  const fragCfg = detection . fragment || { }
  const cfgMinChildChars = Number . isFinite ( fragCfg . minChildChars ) ? fragCfg . minChildChars : 150
  const childMaxLD = Math . max ( maxLD , 0.7 )

  // Text extraction is best-effort: a failure counts as "no text".
  const safeTextLength = ( el ) => {
    try { return getText ( el ) . length } catch { return 0 }
  }

  let current = node
  let fallback = containsSemantic ( node ) ? node : null
  // Every element entered by descent was already checked for fragmentation and
  // for containsSemantic just before the step, so only the starting node needs
  // the fragmentation check inside the loop.
  let fragmentationChecked = false

  for ( let step = 0 ; step < MAX_STEPS && current ; step ++ ) {
    if ( countDirect ( current , 'p' ) >= 1 ) return current

    if ( ! current . children || ! current . children . length ) break

    if ( ! fragmentationChecked && isFragmentedNode ( current , options ) ) break

    const children = Array . from ( current . children ) . filter ( c => c && c . nodeType === 1 )
    if ( ! children . length ) break

    const totalText = safeTextLength ( current )
    const totalParas = paragraphCount ( current )

    const candidates = [ ]
    for ( const child of children ) {
      const textLen = safeTextLength ( child )
      if ( textLen < cfgMinChildChars ) continue
      const paras = paragraphCount ( child )
      if ( paras < 1 ) continue
      if ( linkDensity ( child ) > childMaxLD ) continue
      candidates . push ( { child , textLen , paras , directP : countDirect ( child , 'p' ) } )
    }

    if ( ! candidates . length ) break

    // Longest text first; ties keep document order.
    candidates . sort ( ( a , b ) => ( b . textLen || 0 ) - ( a . textLen || 0 ) )
    const best = candidates [ 0 ]
    const bestLenRatio = totalText > 0 ? best . textLen / totalText : 1
    const bestParaRatio = totalParas > 0 ? best . paras / totalParas : ( best . paras > 0 ? 1 : 0 )

    // The best child is not dominant enough to stand in for the whole container.
    if ( bestLenRatio < 0.45 && bestParaRatio < 0.7 && best . directP === 0 ) break

    // A strong runner-up means the content is split between siblings.
    const second = candidates [ 1 ]
    if ( second ) {
      const secondLenRatio = totalText > 0 ? second . textLen / totalText : 0
      if ( secondLenRatio >= 0.35 ) break
    }

    if ( isFragmentedNode ( best . child , options ) ) break

    if ( containsSemantic ( best . child ) ) fallback = best . child

    current = best . child
    fragmentationChecked = true
  }

  // Covers the case where the step cap was hit right after descending.
  if ( countDirect ( current , 'p' ) >= 1 ) return current
  return fallback || current
}
472+
333473function getXPath ( node ) {
334474 try {
335475 if ( ! node || ! node . ownerDocument ) return ''
@@ -675,6 +815,20 @@ export function detectContent(document, options = {}, seeds = {}) {
675815 }
676816 }
677817
818+ // Prefer a direct paragraph container when available, while preserving fragmented articles.
819+ try {
820+ if ( selected && selected . el ) {
821+ const refinedDirect = preferDirectParagraphContainer ( selected . el , options )
822+ if ( refinedDirect && refinedDirect !== selected . el ) {
823+ const cleanDirect = stripBadContainers ( refinedDirect )
824+ if ( cleanDirect && cleanDirect . innerHTML && cleanDirect . innerHTML . trim ( ) . length > 0 ) {
825+ html = cleanDirect . innerHTML
826+ selected = { el : refinedDirect }
827+ }
828+ }
829+ }
830+ } catch { /* ignore */ }
831+
678832 // Descendant promotion: if selection is BODY, but BODY contains a strong
679833 // preferred content descendant, promote to that descendant.
680834 try {
0 commit comments