Skip to content

Commit 1980662

Browse files
committed
kumottujenpoisto-endpointin poisto, sama toiminto db-päivitykseen
1 parent 0a271b3 commit 1980662

File tree

2 files changed

+56
-65
lines changed

2 files changed

+56
-65
lines changed

README.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,9 @@ docker compose up
1212
## To Do
1313

1414
P1
15-
- Ainakin seuraavat lait eivät typesensessä oikein:
16-
- Laki rikosasioiden ja eräiden riita-asioiden sovittelusta
17-
- Laki riita-asioiden sovittelusta ja sovinnon vahvistamisesta yleisissä tuomioistuimissa
18-
- Normalisointi-endpoint ei tee siistiä jälkeä, pitäisi parantaa ja sitten ajaa koko tietokannalle
19-
- Asiasanahaku -> Asiasanat
20-
- Ä on lajiteltu A:na
21-
- Adoptio hukkunut ruotsinkieliselle puolelle
15+
- Muutossäädökset lakipykälien otsikoihin (xml-parsimisen muutos)
16+
- Asiasanat lakidokumentteihin näkyviin
17+
- DB-päivitys tekee typesensen vain aloitusvuodesta
2218

2319
P2
2420
- Status-taulukon käyttäytyminen vähän jank

backend/src/db/load.ts

Lines changed: 53 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ async function fetchWithBackoff<T = unknown>(url: string, config: any, opts?: {
9292
}
9393
}
9494

95-
9695
function parseFinlexUrl(url: string): { docYear: number; docNumber: string; docLanguage: string; docVersion: string | null } {
9796
try {
9897
const urlObj = new URL(url);
@@ -291,6 +290,39 @@ async function parseCommonNamesFromXML(result: AxiosResponse<unknown>): Promise<
291290
return names
292291
}
293292

293+
/**
294+
* Extract isInForce from the statute XML using fast-xml-parser.
295+
*
296+
* Returns:
297+
* - true / false when finlex:isInForce/@value is exactly the string 'true' / 'false'
298+
* - null when the field is missing, holds any other value, or the XML can't be parsed
299+
*/
300+
async function parseIsInForceFromXml(xmlString: string): Promise<boolean | null> {
301+
try {
302+
const parser = new XMLParser({
303+
ignoreAttributes: false,
304+
attributeNamePrefix: '@_',
305+
removeNSPrefix: true,
306+
});
307+
308+
const parsed = parser.parse(xmlString);
309+
310+
const isInForceNode =
311+
parsed?.AknXmlList?.Results?.akomaNtoso?.act?.meta?.proprietary?.isInForce ??
312+
parsed?.akomaNtoso?.act?.meta?.proprietary?.isInForce ??
313+
null;
314+
315+
const value = isInForceNode?.['@_value'];
316+
317+
if (value === 'true') return true;
318+
if (value === 'false') return false;
319+
return null;
320+
} catch (e) {
321+
console.warn('Failed to parse isInForce from XML:', e);
322+
return null;
323+
}
324+
}
325+
294326
async function parseKeywordsfromXML(result: AxiosResponse<unknown>): Promise<[string, string][]> {
295327
const keyword_list: [string, string][] = [];
296328

@@ -473,10 +505,7 @@ function parseURLfromJudgmentID(judgmentID: string): string {
473505
}
474506

475507
function detectLanguage(text: string): 'fin' | 'swe' | 'unknown' {
476-
// Simple heuristic language detection for Finnish vs Swedish
477508
const lowerText = text.toLowerCase();
478-
479-
// Common Finnish words and patterns
480509
const finnishIndicators = [
481510
'että', 'jossa', 'jonka', 'kanssa', 'mukaan', 'joiden', 'jotka',
482511
'vuonna', 'vuoden', 'korkein oikeus', 'hovioikeus', 'käräjäoikeus',
@@ -486,8 +515,6 @@ function detectLanguage(text: string): 'fin' | 'swe' | 'unknown' {
486515
'tämä', 'näin', 'sekä', 'myös', 'vain', 'kuin', 'ilman',
487516
'saada', 'tehdä', 'antaa', 'pitää', 'tulla', 'voida', 'käydä',
488517
];
489-
490-
// Common Swedish words and patterns
491518
const swedishIndicators = [
492519
'att', 'som', 'med', 'enligt', 'från', 'till', 'har', 'eller',
493520
'år', 'året', 'högsta domstolen', 'hovrätt', 'tingsrätt',
@@ -500,27 +527,14 @@ function detectLanguage(text: string): 'fin' | 'swe' | 'unknown' {
500527

501528
let finnishScore = 0;
502529
let swedishScore = 0;
503-
504-
for (const indicator of finnishIndicators) {
505-
if (lowerText.includes(indicator)) finnishScore++;
506-
}
507-
508-
for (const indicator of swedishIndicators) {
509-
if (lowerText.includes(indicator)) swedishScore++;
510-
}
511-
512-
// Character patterns: å is Swedish-specific (strong signal)
530+
for (const indicator of finnishIndicators) if (lowerText.includes(indicator)) finnishScore++;
531+
for (const indicator of swedishIndicators) if (lowerText.includes(indicator)) swedishScore++;
513532
const aRingCount = (text.match(/å/gi) || []).length;
514533
swedishScore += aRingCount * 3;
515-
516-
// Finnish tends to have more double vowels
517534
const doubleVowels = text.match(/(aa|ee|ii|oo|uu|yy|ää|öö)/gi);
518535
if (doubleVowels && doubleVowels.length > 1) finnishScore += 2;
519-
520-
// Default to unknown if score is too low to be confident
521536
const totalScore = finnishScore + swedishScore;
522537
if (totalScore < 2) return 'unknown';
523-
524538
if (finnishScore > swedishScore) return 'fin';
525539
if (swedishScore > finnishScore) return 'swe';
526540
return 'unknown';
@@ -529,16 +543,10 @@ function detectLanguage(text: string): 'fin' | 'swe' | 'unknown' {
529543
function parseFlightStreamContent(html: string, lang?: 'fin' | 'swe'): string[] {
530544
const scriptRegex = /<script>self\.__next_f\.push\(\[1,(.*?)\]\)<\/script>/gs;
531545
const matches = Array.from(html.matchAll(scriptRegex));
532-
533-
if (matches.length === 0) {
534-
return [];
535-
}
536-
546+
if (matches.length === 0) return [];
537547
const combinedPayload = matches.map(m => m[1]).join('\n');
538-
539548
const highlightableRegex = /\\"className\\":\\"highlightable\\",\\"children\\":\\"((?:[^"\\]|\\.)*?)\\"[}\]]/g;
540549
const contentMatches = Array.from(combinedPayload.matchAll(highlightableRegex));
541-
542550
const fragments: string[] = [];
543551
for (const match of contentMatches) {
544552
let text = match[1]
@@ -547,49 +555,34 @@ function parseFlightStreamContent(html: string, lang?: 'fin' | 'swe'): string[]
547555
.replace(/\\n/g, '\n')
548556
.replace(/\\r/g, '')
549557
.trim();
550-
551-
if (text &&
558+
if (text &&
552559
text.length > 3 &&
553560
!text.match(/^[a-f0-9]+:/) &&
554561
!text.match(/^\$/) &&
555562
!text.includes('$undefined') &&
556563
!text.includes('"className"') &&
557564
!text.includes('"style"')) {
558-
559-
// If language filtering is requested, detect the language of this paragraph
560565
if (lang) {
561566
const detectedLang = detectLanguage(text);
562-
// Only include paragraphs that definitively match the target language
563-
if (detectedLang !== lang) {
564-
continue; // Skip paragraphs that are wrong language OR unknown
565-
}
567+
if (detectedLang !== lang) continue;
566568
}
567-
568569
fragments.push(text);
569570
}
570571
}
571-
572572
return fragments;
573573
}
574574

575575
function extractLangSectionFromDom(inputHTML: string, lang: 'fin' | 'swe'): { content: string; is_empty: boolean } | null {
576576
const dom = new JSDOM(inputHTML);
577577
const doc = dom.window.document;
578-
579-
// Finlex uses two-letter language tags in the rendered Akomantoso section
580578
const langCode = lang === 'fin' ? 'fi' : 'sv';
581579
const section =
582580
doc.querySelector(`section[class*="akomaNtoso"][lang="${langCode}"]`) ||
583581
doc.querySelector(`section[class*="akomaNtoso"][lang="${langCode.toUpperCase()}"]`) ||
584582
doc.querySelector('section[class*="akomaNtoso"]');
585-
586-
if (!section) {
587-
return null;
588-
}
589-
583+
if (!section) return null;
590584
const paragraphs = section.querySelectorAll('p');
591585
const is_empty = !Array.from(paragraphs).some(p => (p.textContent ?? '').trim() !== '');
592-
593586
return { content: section.outerHTML, is_empty };
594587
}
595588

@@ -600,39 +593,30 @@ async function parseAkomafromURL(inputURL: string, lang: string): Promise<{ cont
600593
const inputHTML = result.data as string;
601594
const keywords = parseKeywordsfromHTML(inputHTML, lang);
602595

603-
// Prefer DOM extraction scoped by the explicit lang attribute to avoid mixed-language payloads.
604596
const domSection = extractLangSectionFromDom(inputHTML, lang === 'fin' ? 'fin' : 'swe');
605597
if (domSection) {
606598
return { content: domSection.content, is_empty: domSection.is_empty, keywords };
607599
}
608600

609601
const flightFragments = parseFlightStreamContent(inputHTML, lang === 'fin' ? 'fin' : 'swe');
610-
611602
if (flightFragments.length > 0) {
612603
const paragraphs = flightFragments
613604
.map(text => `<p class="highlightable">${text}</p>`)
614605
.join('\n');
615-
616606
const content = `<section class="styles_akomaNtoso__parsed">\n${paragraphs}\n</section>`;
617607
const is_empty = flightFragments.length === 0 || flightFragments.every(f => f.trim() === '');
618-
619608
return { content, is_empty, keywords };
620609
}
621610

622-
// Fallback to DOM parsing for older pages
623611
const dom = new JSDOM(inputHTML);
624612
const doc = dom.window.document;
625613
const section = doc.querySelector('section[class*="akomaNtoso"]');
626-
627614
let is_empty = true;
628-
629615
if (section) {
630616
const paragraphs = section.querySelectorAll('p');
631617
is_empty = !Array.from(paragraphs).some(p => (p.textContent ?? '').trim() !== '');
632618
}
633-
634619
const content = section ? section.outerHTML : '';
635-
636620
return { content, is_empty, keywords };
637621
}
638622

@@ -644,7 +628,6 @@ async function checkIsXMLEmpty(xmlString: string): Promise<boolean> {
644628
const parsed = parser.parse(xmlString);
645629

646630
const body = parsed?.['akomaNtoso']?.['act']?.['body'];
647-
648631
if (!body) return false;
649632

650633
const container = body['hcontainer'];
@@ -657,7 +640,6 @@ async function checkIsXMLEmpty(xmlString: string): Promise<boolean> {
657640
}
658641
}
659642

660-
661643
const baseURL = 'https://opendata.finlex.fi/finlex/avoindata/v1';
662644

663645
async function setImages(statuteUuid: string, docYear: number, docNumber: string, language: string, version: string | null, uris: string[]) {
@@ -715,12 +697,25 @@ async function setSingleStatute(uris : { uri: string, uriOld: string}) {
715697
}
716698
}
717699

700+
const xmlContent = result.data as string;
701+
const isInForce = await parseIsInForceFromXml(xmlContent);
702+
703+
if (isInForce === false) {
704+
const { docYear, docNumber, docLanguage, docVersion } = parseFinlexUrl(uri);
705+
console.log(
706+
`[statute-loader] skipped (isInForce=false): ` +
707+
`${docYear}/${docNumber}/${docLanguage}@${docVersion ?? ''} ` +
708+
`uri=${uri}`
709+
);
710+
return null;
711+
}
712+
// Not skipped: isInForce is true or could not be determined (null), so continue parsing the statute.
713+
718714
const docTitle = await parseTitlefromXML(result)
719715
const imageLinks = await parseImagesfromXML(result)
720716
const keywordList = await parseKeywordsfromXML(result)
721717
const commonNames = await parseCommonNamesFromXML(result)
722718

723-
const xmlContent = result.data as string;
724719
const is_empty = await checkIsXMLEmpty(xmlContent);
725720

726721
const { docYear, docNumber, docLanguage, docVersion } = parseFinlexUrl(uri)

0 commit comments

Comments
 (0)