@@ -92,7 +92,6 @@ async function fetchWithBackoff<T = unknown>(url: string, config: any, opts?: {
9292 }
9393}
9494
95-
9695function parseFinlexUrl ( url : string ) : { docYear : number ; docNumber : string ; docLanguage : string ; docVersion : string | null } {
9796 try {
9897 const urlObj = new URL ( url ) ;
@@ -291,6 +290,39 @@ async function parseCommonNamesFromXML(result: AxiosResponse<unknown>): Promise<
291290 return names
292291}
293292
/**
 * Reads the in-force flag of a statute from its XML using fast-xml-parser.
 *
 * Attributes are kept (under the `@_` prefix) and namespace prefixes are
 * stripped, so `finlex:isInForce/@value` is read as `isInForce['@_value']`.
 * The element is looked up at `akomaNtoso > act > meta > proprietary`,
 * first beneath an `AknXmlList > Results` wrapper and then at the document
 * root.
 *
 * @param xmlString Raw XML text of the statute.
 * @returns `true` / `false` when the attribute value is exactly the string
 *   `'true'` / `'false'`; `null` when the element or attribute is missing,
 *   holds any other value, or parsing throws (the error is logged as a
 *   warning and not rethrown).
 */
async function parseIsInForceFromXml(xmlString: string): Promise<boolean | null> {
  try {
    const xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      removeNSPrefix: true,
    });
    const tree = xmlParser.parse(xmlString);

    // Wrapped (list response) location takes precedence over the bare document.
    const flagNode =
      tree?.AknXmlList?.Results?.akomaNtoso?.act?.meta?.proprietary?.isInForce ??
      tree?.akomaNtoso?.act?.meta?.proprietary?.isInForce ??
      null;

    switch (flagNode?.['@_value']) {
      case 'true':
        return true;
      case 'false':
        return false;
      default:
        return null;
    }
  } catch (e) {
    console.warn('Failed to parse isInForce from XML:', e);
    return null;
  }
}

325+
294326async function parseKeywordsfromXML ( result : AxiosResponse < unknown > ) : Promise < [ string , string ] [ ] > {
295327 const keyword_list : [ string , string ] [ ] = [ ] ;
296328
@@ -473,10 +505,7 @@ function parseURLfromJudgmentID(judgmentID: string): string {
473505}
474506
475507function detectLanguage ( text : string ) : 'fin' | 'swe' | 'unknown' {
476- // Simple heuristic language detection for Finnish vs Swedish
477508 const lowerText = text . toLowerCase ( ) ;
478-
479- // Common Finnish words and patterns
480509 const finnishIndicators = [
481510 'että' , 'jossa' , 'jonka' , 'kanssa' , 'mukaan' , 'joiden' , 'jotka' ,
482511 'vuonna' , 'vuoden' , 'korkein oikeus' , 'hovioikeus' , 'käräjäoikeus' ,
@@ -486,8 +515,6 @@ function detectLanguage(text: string): 'fin' | 'swe' | 'unknown' {
486515 'tämä' , 'näin' , 'sekä' , 'myös' , 'vain' , 'kuin' , 'ilman' ,
487516 'saada' , 'tehdä' , 'antaa' , 'pitää' , 'tulla' , 'voida' , 'käydä' ,
488517 ] ;
489-
490- // Common Swedish words and patterns
491518 const swedishIndicators = [
492519 'att' , 'som' , 'med' , 'enligt' , 'från' , 'till' , 'har' , 'eller' ,
493520 'år' , 'året' , 'högsta domstolen' , 'hovrätt' , 'tingsrätt' ,
@@ -500,27 +527,14 @@ function detectLanguage(text: string): 'fin' | 'swe' | 'unknown' {
500527
501528 let finnishScore = 0 ;
502529 let swedishScore = 0 ;
503-
504- for ( const indicator of finnishIndicators ) {
505- if ( lowerText . includes ( indicator ) ) finnishScore ++ ;
506- }
507-
508- for ( const indicator of swedishIndicators ) {
509- if ( lowerText . includes ( indicator ) ) swedishScore ++ ;
510- }
511-
512- // Character patterns: å is Swedish-specific (strong signal)
530+ for ( const indicator of finnishIndicators ) if ( lowerText . includes ( indicator ) ) finnishScore ++ ;
531+ for ( const indicator of swedishIndicators ) if ( lowerText . includes ( indicator ) ) swedishScore ++ ;
513532 const aRingCount = ( text . match ( / å / gi) || [ ] ) . length ;
514533 swedishScore += aRingCount * 3 ;
515-
516- // Finnish tends to have more double vowels
517534 const doubleVowels = text . match ( / ( a a | e e | i i | o o | u u | y y | ä ä | ö ö ) / gi) ;
518535 if ( doubleVowels && doubleVowels . length > 1 ) finnishScore += 2 ;
519-
520- // Default to unknown if score is too low to be confident
521536 const totalScore = finnishScore + swedishScore ;
522537 if ( totalScore < 2 ) return 'unknown' ;
523-
524538 if ( finnishScore > swedishScore ) return 'fin' ;
525539 if ( swedishScore > finnishScore ) return 'swe' ;
526540 return 'unknown' ;
@@ -529,16 +543,10 @@ function detectLanguage(text: string): 'fin' | 'swe' | 'unknown' {
529543function parseFlightStreamContent ( html : string , lang ?: 'fin' | 'swe' ) : string [ ] {
530544 const scriptRegex = / < s c r i p t > s e l f \. _ _ n e x t _ f \. p u s h \( \[ 1 , ( .* ?) \] \) < \/ s c r i p t > / gs;
531545 const matches = Array . from ( html . matchAll ( scriptRegex ) ) ;
532-
533- if ( matches . length === 0 ) {
534- return [ ] ;
535- }
536-
546+ if ( matches . length === 0 ) return [ ] ;
537547 const combinedPayload = matches . map ( m => m [ 1 ] ) . join ( '\n' ) ;
538-
539548 const highlightableRegex = / \\ " c l a s s N a m e \\ " : \\ " h i g h l i g h t a b l e \\ " , \\ " c h i l d r e n \\ " : \\ " ( (?: [ ^ " \\ ] | \\ .) * ?) \\ " [ } \] ] / g;
540549 const contentMatches = Array . from ( combinedPayload . matchAll ( highlightableRegex ) ) ;
541-
542550 const fragments : string [ ] = [ ] ;
543551 for ( const match of contentMatches ) {
544552 let text = match [ 1 ]
@@ -547,49 +555,34 @@ function parseFlightStreamContent(html: string, lang?: 'fin' | 'swe'): string[]
547555 . replace ( / \\ n / g, '\n' )
548556 . replace ( / \\ r / g, '' )
549557 . trim ( ) ;
550-
551- if ( text &&
558+ if ( text &&
552559 text . length > 3 &&
553560 ! text . match ( / ^ [ a - f 0 - 9 ] + : / ) &&
554561 ! text . match ( / ^ \$ / ) &&
555562 ! text . includes ( '$undefined' ) &&
556563 ! text . includes ( '"className"' ) &&
557564 ! text . includes ( '"style"' ) ) {
558-
559- // If language filtering is requested, detect the language of this paragraph
560565 if ( lang ) {
561566 const detectedLang = detectLanguage ( text ) ;
562- // Only include paragraphs that definitively match the target language
563- if ( detectedLang !== lang ) {
564- continue ; // Skip paragraphs that are wrong language OR unknown
565- }
567+ if ( detectedLang !== lang ) continue ;
566568 }
567-
568569 fragments . push ( text ) ;
569570 }
570571 }
571-
572572 return fragments ;
573573}
574574
/**
 * Extracts the Akoma Ntoso `<section>` for the requested language from a
 * rendered HTML page.
 *
 * Candidate sections are tried in priority order: a section whose `lang`
 * attribute is the lowercase two-letter code, then the uppercase code, and
 * finally any section whose class contains `akomaNtoso` regardless of
 * language.
 *
 * @param inputHTML Full HTML of the page.
 * @param lang Requested language (`'fin'` maps to `fi`, `'swe'` to `sv`).
 * @returns The section's outer HTML plus `is_empty` (true when no `<p>`
 *   inside it has non-whitespace text), or `null` when no matching section
 *   exists.
 */
function extractLangSectionFromDom(inputHTML: string, lang: 'fin' | 'swe'): { content: string; is_empty: boolean } | null {
  const { document: doc } = new JSDOM(inputHTML).window;

  // Finlex uses two-letter language tags in the rendered Akomantoso section.
  const twoLetter = lang === 'fin' ? 'fi' : 'sv';
  const selectorsByPriority = [
    `section[class*="akomaNtoso"][lang="${twoLetter}"]`,
    `section[class*="akomaNtoso"][lang="${twoLetter.toUpperCase()}"]`,
    'section[class*="akomaNtoso"]',
  ];

  let section: Element | null = null;
  for (const selector of selectorsByPriority) {
    section = doc.querySelector(selector);
    if (section) break;
  }
  if (!section) return null;

  const is_empty = Array.from(section.querySelectorAll('p')).every(
    p => (p.textContent ?? '').trim() === '',
  );
  return { content: section.outerHTML, is_empty };
}
595588
@@ -600,39 +593,30 @@ async function parseAkomafromURL(inputURL: string, lang: string): Promise<{ cont
600593 const inputHTML = result . data as string ;
601594 const keywords = parseKeywordsfromHTML ( inputHTML , lang ) ;
602595
603- // Prefer DOM extraction scoped by the explicit lang attribute to avoid mixed-language payloads.
604596 const domSection = extractLangSectionFromDom ( inputHTML , lang === 'fin' ? 'fin' : 'swe' ) ;
605597 if ( domSection ) {
606598 return { content : domSection . content , is_empty : domSection . is_empty , keywords } ;
607599 }
608600
609601 const flightFragments = parseFlightStreamContent ( inputHTML , lang === 'fin' ? 'fin' : 'swe' ) ;
610-
611602 if ( flightFragments . length > 0 ) {
612603 const paragraphs = flightFragments
613604 . map ( text => `<p class="highlightable">${ text } </p>` )
614605 . join ( '\n' ) ;
615-
616606 const content = `<section class="styles_akomaNtoso__parsed">\n${ paragraphs } \n</section>` ;
617607 const is_empty = flightFragments . length === 0 || flightFragments . every ( f => f . trim ( ) === '' ) ;
618-
619608 return { content, is_empty, keywords } ;
620609 }
621610
622- // Fallback to DOM parsing for older pages
623611 const dom = new JSDOM ( inputHTML ) ;
624612 const doc = dom . window . document ;
625613 const section = doc . querySelector ( 'section[class*="akomaNtoso"]' ) ;
626-
627614 let is_empty = true ;
628-
629615 if ( section ) {
630616 const paragraphs = section . querySelectorAll ( 'p' ) ;
631617 is_empty = ! Array . from ( paragraphs ) . some ( p => ( p . textContent ?? '' ) . trim ( ) !== '' ) ;
632618 }
633-
634619 const content = section ? section . outerHTML : '' ;
635-
636620 return { content, is_empty, keywords } ;
637621}
638622
@@ -644,7 +628,6 @@ async function checkIsXMLEmpty(xmlString: string): Promise<boolean> {
644628 const parsed = parser . parse ( xmlString ) ;
645629
646630 const body = parsed ?. [ 'akomaNtoso' ] ?. [ 'act' ] ?. [ 'body' ] ;
647-
648631 if ( ! body ) return false ;
649632
650633 const container = body [ 'hcontainer' ] ;
@@ -657,7 +640,6 @@ async function checkIsXMLEmpty(xmlString: string): Promise<boolean> {
657640 }
658641}
659642
660-
661643const baseURL = 'https://opendata.finlex.fi/finlex/avoindata/v1' ;
662644
663645async function setImages ( statuteUuid : string , docYear : number , docNumber : string , language : string , version : string | null , uris : string [ ] ) {
@@ -715,12 +697,25 @@ async function setSingleStatute(uris : { uri: string, uriOld: string}) {
715697 }
716698 }
717699
700+ const xmlContent = result . data as string ;
701+ const isInForce = await parseIsInForceFromXml ( xmlContent ) ;
702+
703+ if ( isInForce === false ) {
704+ const { docYear, docNumber, docLanguage, docVersion } = parseFinlexUrl ( uri ) ;
705+ console . log (
706+ `[statute-loader] skipped (isInForce=false): ` +
707+ `${ docYear } /${ docNumber } /${ docLanguage } @${ docVersion ?? '' } ` +
708+ `uri=${ uri } `
709+ ) ;
710+ return null ;
711+ }
712+ // ------------------------------------------------------
713+
718714 const docTitle = await parseTitlefromXML ( result )
719715 const imageLinks = await parseImagesfromXML ( result )
720716 const keywordList = await parseKeywordsfromXML ( result )
721717 const commonNames = await parseCommonNamesFromXML ( result )
722718
723- const xmlContent = result . data as string ;
724719 const is_empty = await checkIsXMLEmpty ( xmlContent ) ;
725720
726721 const { docYear, docNumber, docLanguage, docVersion } = parseFinlexUrl ( uri )
0 commit comments