Skip to content

Commit 279f121

Browse files
committed
alustava fi/sv jako oikeuskäytännölle, logiikka toimiva mutta epäsiisti
1 parent fc74406 commit 279f121

File tree

1 file changed

+84
-2
lines changed

1 file changed

+84
-2
lines changed

backend/src/db/load.ts

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,61 @@ function parseURLfromJudgmentID(judgmentID: string): string {
312312
}
313313
}
314314

315-
function parseFlightStreamContent(html: string): string[] {
315+
/**
 * Finnish indicator words and phrases (lower-case). Kept at module level so the
 * list is built once rather than on every paragraph that gets classified.
 */
const FINNISH_INDICATOR_WORDS: readonly string[] = [
  'että', 'jossa', 'jonka', 'kanssa', 'mukaan', 'joiden', 'jotka',
  'vuonna', 'vuoden', 'korkein oikeus', 'hovioikeus', 'käräjäoikeus',
  'asiassa', 'kanne', 'valitus', 'tuomio', 'päätös', 'perustuslaki',
  'laki', 'säännös', 'oikeus', 'velvollisuus', 'sopimusrikkomus',
  'olla', 'ollut', 'ollaan', 'olleet', 'ovat', 'ole',
  'tämä', 'näin', 'sekä', 'myös', 'vain', 'kuin', 'ilman',
  'saada', 'tehdä', 'antaa', 'pitää', 'tulla', 'voida', 'käydä',
];

/** Swedish indicator words and phrases (lower-case). */
const SWEDISH_INDICATOR_WORDS: readonly string[] = [
  'att', 'som', 'med', 'enligt', 'från', 'till', 'har', 'eller',
  'år', 'året', 'högsta domstolen', 'hovrätt', 'tingsrätt',
  'ärende', 'talan', 'besvär', 'dom', 'beslut', 'grundlag',
  'lag', 'bestämmelse', 'rätt', 'skyldighet', 'avtalsbrott',
  'vara', 'varit', 'är', 'var', 'hade', 'skulle', 'kunde',
  'denna', 'detta', 'den', 'det', 'och', 'även', 'bara',
  'få', 'göra', 'ge', 'hålla', 'komma', 'kunna', 'skall',
];

/**
 * Indicators of this length or shorter must match a whole word. Short words such
 * as 'den', 'är' or 'att' otherwise occur inside ordinary Finnish words
 * ("vuoden", "käräjäoikeus", "kattaa") and skew the score.
 */
const MAX_EXACT_INDICATOR_LENGTH = 3;

/**
 * Counts how many distinct indicators occur in the text.
 *
 * @param paddedWords - Lower-cased words of the text joined by single spaces,
 *   with one leading and one trailing space.
 * @param indicators - Lower-case indicator words or phrases.
 * @returns Number of indicators found; each indicator counts at most once.
 */
function countLanguageIndicators(paddedWords: string, indicators: readonly string[]): number {
  let hits = 0;
  for (const indicator of indicators) {
    // Every indicator has to begin at a word start. Longer indicators may be
    // followed by an inflectional ending ("hovrätt" -> "hovrätten"); short ones
    // must also end at a word boundary.
    const needle =
      indicator.length <= MAX_EXACT_INDICATOR_LENGTH ? ` ${indicator} ` : ` ${indicator}`;
    if (paddedWords.includes(needle)) hits++;
  }
  return hits;
}

/**
 * Heuristically classifies a text fragment as Finnish or Swedish.
 *
 * Scoring: one point per distinct indicator word found, three points for the
 * Swedish side per occurrence of the letter 'å', and two points for the Finnish
 * side when the text contains more than one double vowel.
 *
 * @param text - Text to classify.
 * @returns 'fin' or 'swe' when one language scores strictly higher and the total
 *   score is at least 2; otherwise 'unknown'.
 */
function detectLanguage(text: string): 'fin' | 'swe' | 'unknown' {
  // NFC so that a decomposed 'a' + combining ring is seen as the single letter 'å'.
  const lowerText = text.normalize('NFC').toLowerCase();

  // Split into runs of letters; digits and punctuation act as word separators.
  const words = lowerText.match(/\p{L}+/gu) ?? [];
  const paddedWords = ` ${words.join(' ')} `;

  let finnishScore = countLanguageIndicators(paddedWords, FINNISH_INDICATOR_WORDS);
  let swedishScore = countLanguageIndicators(paddedWords, SWEDISH_INDICATOR_WORDS);

  // Character patterns: å is Swedish-specific (strong signal)
  const aRingCount = (lowerText.match(/å/g) ?? []).length;
  swedishScore += aRingCount * 3;

  // Finnish tends to have more double vowels
  const doubleVowels = lowerText.match(/(aa|ee|ii|oo|uu|yy|ää|öö)/g);
  if (doubleVowels && doubleVowels.length > 1) finnishScore += 2;

  // Default to unknown if score is too low to be confident
  const totalScore = finnishScore + swedishScore;
  if (totalScore < 2) return 'unknown';

  if (finnishScore > swedishScore) return 'fin';
  if (swedishScore > finnishScore) return 'swe';
  return 'unknown';
}
368+
369+
function parseFlightStreamContent(html: string, lang?: 'fin' | 'swe'): string[] {
316370
const scriptRegex = /<script>self\.__next_f\.push\(\[1,(.*?)\]\)<\/script>/gs;
317371
const matches = Array.from(html.matchAll(scriptRegex));
318372

@@ -341,21 +395,49 @@ function parseFlightStreamContent(html: string): string[] {
341395
!text.includes('$undefined') &&
342396
!text.includes('"className"') &&
343397
!text.includes('"style"')) {
398+
399+
// If language filtering is requested, detect the language of this paragraph
400+
if (lang) {
401+
const detectedLang = detectLanguage(text);
402+
// Only include paragraphs that definitively match the target language
403+
if (detectedLang !== lang) {
404+
continue; // Skip paragraphs that are wrong language OR unknown
405+
}
406+
}
407+
344408
fragments.push(text);
345409
}
346410
}
347411

348412
return fragments;
349413
}
350414

415+
/**
 * Pulls the language-specific Akoma Ntoso section out of a rendered HTML page.
 *
 * The section is located by a `section` element whose class attribute contains
 * "akomaNtoso" and whose `lang` attribute equals the two-letter code for `lang`.
 *
 * @param inputHTML - Full HTML document as a string.
 * @param lang - Requested language, 'fin' or 'swe'.
 * @returns The section's outer HTML plus a flag telling whether its text content
 *   is blank, or `null` when no matching section exists.
 */
function extractLangSectionFromDom(inputHTML: string, lang: 'fin' | 'swe'): { content: string; is_empty: boolean } | null {
  // Finlex uses two-letter language tags in the rendered Akomantoso section
  const twoLetterCode = lang === 'fin' ? 'fi' : 'sv';
  const selector = `section[class*="akomaNtoso"][lang="${twoLetterCode}"]`;

  const parsedDocument = new JSDOM(inputHTML).window.document;
  const matchedSection = parsedDocument.querySelector(selector) as HTMLElement | null;
  if (!matchedSection) {
    return null;
  }

  const sectionText = matchedSection.textContent ?? '';
  return {
    content: matchedSection.outerHTML,
    is_empty: sectionText.trim() === '',
  };
}
427+
351428
async function parseAkomafromURL(inputURL: string, lang: string): Promise<{ content: string; is_empty: boolean, keywords: string[] }> {
352429
const result = await fetchWithBackoff<string>(inputURL, {
353430
headers: { 'Accept': 'text/html', 'Accept-Encoding': 'gzip' }
354431
});
355432
const inputHTML = result.data as string;
356433
const keywords = parseKeywordsfromHTML(inputHTML, lang);
434+
// Prefer DOM extraction scoped by the explicit lang attribute to avoid mixed-language payloads.
435+
const domSection = extractLangSectionFromDom(inputHTML, lang === 'fin' ? 'fin' : 'swe');
436+
if (domSection) {
437+
return { content: domSection.content, is_empty: domSection.is_empty, keywords };
438+
}
357439

358-
const flightFragments = parseFlightStreamContent(inputHTML);
440+
const flightFragments = parseFlightStreamContent(inputHTML, lang === 'fin' ? 'fin' : 'swe');
359441

360442
if (flightFragments.length > 0) {
361443
const paragraphs = flightFragments

0 commit comments

Comments
 (0)