@@ -312,7 +312,61 @@ function parseURLfromJudgmentID(judgmentID: string): string {
312312 }
313313}
314314
315- function parseFlightStreamContent ( html : string ) : string [ ] {
/** Lowercase Finnish marker words/phrases; each is matched as a whole word or phrase. */
const FINNISH_INDICATORS: readonly string[] = [
  'että', 'jossa', 'jonka', 'kanssa', 'mukaan', 'joiden', 'jotka',
  'vuonna', 'vuoden', 'korkein oikeus', 'hovioikeus', 'käräjäoikeus',
  'asiassa', 'kanne', 'valitus', 'tuomio', 'päätös', 'perustuslaki',
  'laki', 'säännös', 'oikeus', 'velvollisuus', 'sopimusrikkomus',
  'olla', 'ollut', 'ollaan', 'olleet', 'ovat', 'ole',
  'tämä', 'näin', 'sekä', 'myös', 'vain', 'kuin', 'ilman',
  'saada', 'tehdä', 'antaa', 'pitää', 'tulla', 'voida', 'käydä',
];

/** Lowercase Swedish marker words/phrases; each is matched as a whole word or phrase. */
const SWEDISH_INDICATORS: readonly string[] = [
  'att', 'som', 'med', 'enligt', 'från', 'till', 'har', 'eller',
  'år', 'året', 'högsta domstolen', 'hovrätt', 'tingsrätt',
  'ärende', 'talan', 'besvär', 'dom', 'beslut', 'grundlag',
  'lag', 'bestämmelse', 'rätt', 'skyldighet', 'avtalsbrott',
  'vara', 'varit', 'är', 'var', 'hade', 'skulle', 'kunde',
  'denna', 'detta', 'den', 'det', 'och', 'även', 'bara',
  'få', 'göra', 'ge', 'hålla', 'komma', 'kunna', 'skall',
];

/**
 * Heuristically classifies a text fragment as Finnish or Swedish.
 *
 * Scoring:
 *  - +1 per distinct indicator word/phrase present (whole-word match, case-insensitive);
 *  - +3 to Swedish per occurrence of the letter "å";
 *  - +2 to Finnish when more than one doubled vowel (aa, ee, ii, oo, uu, yy, ää, öö) occurs.
 *
 * @param text - Fragment to classify.
 * @returns `'fin'` or `'swe'` when one score is strictly higher and the combined
 *          score is at least 2; otherwise `'unknown'`.
 */
function detectLanguage(text: string): 'fin' | 'swe' | 'unknown' {
  // NFC so that decomposed "a + combining ring/diaeresis" is seen as a single letter.
  const lowerText = text.normalize('NFC').toLowerCase();

  // Reduce the text to letter-only words separated by single spaces and pad both ends,
  // so " indicator " is a whole-word (or whole-phrase) test. Plain substring matching
  // would let short markers such as "har", "var" or "den" fire inside unrelated words
  // of the other language (e.g. Finnish "harkitsi", "varten", "oikeuden").
  const padded = ` ${lowerText.split(/[^\p{L}]+/u).filter(Boolean).join(' ')} `;

  const countHits = (indicators: readonly string[]): number =>
    indicators.filter((indicator) => padded.includes(` ${indicator} `)).length;

  let finnishScore = countHits(FINNISH_INDICATORS);
  let swedishScore = countHits(SWEDISH_INDICATORS);

  // Character pattern: "å" is treated as a strong Swedish signal.
  const aRingCount = (lowerText.match(/å/g) ?? []).length;
  swedishScore += aRingCount * 3;

  // Finnish tends to have more doubled vowels.
  const doubleVowels = lowerText.match(/aa|ee|ii|oo|uu|yy|ää|öö/g);
  if (doubleVowels && doubleVowels.length > 1) finnishScore += 2;

  // Too little evidence overall, or a tie, is reported as unknown.
  if (finnishScore + swedishScore < 2) return 'unknown';
  if (finnishScore > swedishScore) return 'fin';
  if (swedishScore > finnishScore) return 'swe';
  return 'unknown';
}
368+
369+ function parseFlightStreamContent ( html : string , lang ?: 'fin' | 'swe' ) : string [ ] {
316370 const scriptRegex = / < s c r i p t > s e l f \. _ _ n e x t _ f \. p u s h \( \[ 1 , ( .* ?) \] \) < \/ s c r i p t > / gs;
317371 const matches = Array . from ( html . matchAll ( scriptRegex ) ) ;
318372
@@ -341,21 +395,49 @@ function parseFlightStreamContent(html: string): string[] {
341395 ! text . includes ( '$undefined' ) &&
342396 ! text . includes ( '"className"' ) &&
343397 ! text . includes ( '"style"' ) ) {
398+
399+ // If language filtering is requested, detect the language of this paragraph
400+ if ( lang ) {
401+ const detectedLang = detectLanguage ( text ) ;
402+ // Only include paragraphs that definitively match the target language
403+ if ( detectedLang !== lang ) {
404+ continue ; // Skip paragraphs that are wrong language OR unknown
405+ }
406+ }
407+
344408 fragments . push ( text ) ;
345409 }
346410 }
347411
348412 return fragments ;
349413}
350414
/**
 * Extracts the rendered Akoma Ntoso `<section>` for one language from a page.
 *
 * The section is located with the selector
 * `section[class*="akomaNtoso"][lang="fi"|"sv"]`; the three-letter `lang`
 * argument is mapped to the two-letter attribute value used in that selector.
 *
 * @param inputHTML - Full HTML document source to search.
 * @param lang - Target language: `'fin'` maps to `lang="fi"`, `'swe'` to `lang="sv"`.
 * @returns The section's outer HTML plus an `is_empty` flag (true when its text
 *          content is blank after trimming), or `null` when no such section exists.
 */
function extractLangSectionFromDom(inputHTML: string, lang: 'fin' | 'swe'): { content: string; is_empty: boolean } | null {
  const dom = new JSDOM(inputHTML);
  try {
    // Two-letter language tag expected on the rendered section's lang attribute.
    const langCode = lang === 'fin' ? 'fi' : 'sv';
    const section = dom.window.document.querySelector(`section[class*="akomaNtoso"][lang="${langCode}"]`);
    if (!section) return null;

    // Read everything we need as plain strings before the window is torn down.
    const is_empty = (section.textContent ?? '').trim() === '';
    return { content: section.outerHTML, is_empty };
  } finally {
    // Release the jsdom window's resources; one instance is created per parsed page.
    dom.window.close();
  }
}
427+
351428async function parseAkomafromURL ( inputURL : string , lang : string ) : Promise < { content : string ; is_empty : boolean , keywords : string [ ] } > {
352429 const result = await fetchWithBackoff < string > ( inputURL , {
353430 headers : { 'Accept' : 'text/html' , 'Accept-Encoding' : 'gzip' }
354431 } ) ;
355432 const inputHTML = result . data as string ;
356433 const keywords = parseKeywordsfromHTML ( inputHTML , lang ) ;
434+ // Prefer DOM extraction scoped by the explicit lang attribute to avoid mixed-language payloads.
435+ const domSection = extractLangSectionFromDom ( inputHTML , lang === 'fin' ? 'fin' : 'swe' ) ;
436+ if ( domSection ) {
437+ return { content : domSection . content , is_empty : domSection . is_empty , keywords } ;
438+ }
357439
358- const flightFragments = parseFlightStreamContent ( inputHTML ) ;
440+ const flightFragments = parseFlightStreamContent ( inputHTML , lang === 'fin' ? 'fin' : 'swe' ) ;
359441
360442 if ( flightFragments . length > 0 ) {
361443 const paragraphs = flightFragments
0 commit comments