Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ docker_volumes/*/*
out
build
/.env
.garden
.garden
tsconfig.tsbuildinfo
1 change: 0 additions & 1 deletion bundestag.io/admin/tsconfig.tsbuildinfo

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import { describe, it, expect } from 'vitest';
import { buildConfigFrom, config, defaults } from './config.js';

describe('config', () => {
  it('provides default values when env not set', () => {
    // With no overrides present, the frozen singleton must mirror the
    // documented defaults exported alongside it.
    expect(config.conference.year).toBe(defaults.CONFERENCE_YEAR);
    expect(config.conference.week).toBe(defaults.CONFERENCE_WEEK);
    expect(config.conference.limit).toBe(defaults.CONFERENCE_LIMIT);
    expect(config.crawl.maxRequestsPerCrawl).toBe(defaults.CRAWL_MAX_REQUESTS_PER_CRAWL);
    expect(typeof config.db.url).toBe('string');
  });

  it('overrides values from environment via buildConfigFrom', () => {
    // buildConfigFrom merges overrides on top of the module-load snapshot,
    // so every supplied value must win over defaults.
    const built = buildConfigFrom({
      CONFERENCE_YEAR: '2030',
      CONFERENCE_WEEK: '12',
      CONFERENCE_LIMIT: '25',
      CRAWL_MAX_REQUESTS_PER_CRAWL: '42',
      TEST: '1',
      DB_URL: 'mongodb://example/db',
    });
    expect(built.conference.year).toBe(2030);
    expect(built.conference.week).toBe(12);
    expect(built.conference.limit).toBe(25);
    expect(built.crawl.maxRequestsPerCrawl).toBe(42);
    expect(built.runtime.isTest).toBe(true);
    expect(built.db.url).toBe('mongodb://example/db');
  });

  it('throws on invalid integer', () => {
    expect(() => buildConfigFrom({ CONFERENCE_WEEK: 'not-a-number' })).toThrow(/Invalid integer/);
  });

  it('is immutable', () => {
    expect(Object.isFrozen(config)).toBe(true);
    const weekBefore = config.conference.week;
    try {
      // Deliberate illegal write: throws in strict mode, silent no-op otherwise.
      (config.conference as unknown as { week: number }).week = weekBefore + 1000;
    } catch {
      // Either way, the frozen object must remain untouched (checked below).
    }
    expect(config.conference.week).toBe(weekBefore);
  });
});
114 changes: 114 additions & 0 deletions services/cron-jobs/import-conference-week-details/src/config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/**
* Central configuration module for the import-conference-week-details service.
*
* All access to process.env MUST go through this file so that:
* - The external (environment) interface is explicit and documented
* - Defaults and validation are applied in one place
* - Tests can reliably override configuration
* - Future refactors (e.g. to use dotenv, secret managers) are localized
*/

// Define the raw shape of required & optional environment variables.
// Every field is an optional string because process.env values are always
// strings and may be absent; numeric parsing/validation happens in buildConfig.
interface RawEnv {
  CONFERENCE_YEAR?: string; // parsed to conference.year (integer string, e.g. '2025')
  CONFERENCE_WEEK?: string; // parsed to conference.week (integer string, e.g. '37')
  CONFERENCE_LIMIT?: string; // parsed to conference.limit (integer string)
  CRAWL_MAX_REQUESTS_PER_CRAWL?: string; // parsed to crawl.maxRequestsPerCrawl (integer string)
  DB_URL?: string; // Mongo connection string; fallback default applied in buildConfig
  TEST?: string; // presence (any non-empty value) indicates test mode
}

// Public, typed configuration shape consumed by the application.
// The exported `config` instance is deep-frozen, so these fields are
// effectively readonly at runtime.
export interface AppConfig {
  conference: {
    year: number; // Year to start crawling conference weeks
    week: number; // Week number to start crawling
    limit: number; // Pagination limit or item limit when requesting detail page
  };
  crawl: {
    maxRequestsPerCrawl: number; // Upper bound of requests per crawl run
  };
  db: {
    url: string; // Mongo connection string (only used outside of TEST mode)
  };
  runtime: {
    isTest: boolean; // Indicates test mode (skips DB interaction etc.)
  };
}

// Defaults centralised here for easy visibility & single source of truth.
// Shallow-frozen so they cannot be mutated at runtime; also re-exported
// further down as `defaults` for tests/tooling.
const DEFAULTS = Object.freeze({
  CONFERENCE_YEAR: 2025,
  CONFERENCE_WEEK: 37,
  CONFERENCE_LIMIT: 10,
  CRAWL_MAX_REQUESTS_PER_CRAWL: 10,
  DB_URL: 'mongodb://localhost:27017/bundestagio',
});

// Minimal integer parser with safe fallback
/**
 * Parse an integer environment value with a safe fallback.
 *
 * - Returns `fallback` when the value is absent (undefined/null) or empty.
 * - Throws when the value is present but not a well-formed base-10 integer,
 *   so misconfiguration fails fast instead of being silently truncated
 *   (previously '12.5' parsed to 12 and '42abc' parsed to 42).
 *
 * @param value     Raw environment string (may be undefined).
 * @param fallback  Value used when the variable is not set.
 * @param fieldName Variable name used in the thrown error message.
 */
const parseIntSafe = (value: string | undefined, fallback: number, fieldName: string): number => {
  // `== null` deliberately matches both undefined and null
  if (value == null || value === '') return fallback;
  // Accept only optionally-signed digit runs (leading/trailing whitespace is
  // tolerated, matching Number.parseInt's own whitespace handling).
  if (!/^\s*[+-]?\d+\s*$/.test(value)) {
    throw new Error(`Invalid integer for ${fieldName}: '${value}'`);
  }
  return Number.parseInt(value, 10);
};

// Snapshot of process.env taken once at module load and shallow-frozen to
// prevent mutation during runtime. This is the ONLY place process.env is
// read, keeping the service's external interface explicit (see module header).
const rawEnv: RawEnv = Object.freeze({
  CONFERENCE_YEAR: process.env.CONFERENCE_YEAR,
  CONFERENCE_WEEK: process.env.CONFERENCE_WEEK,
  CONFERENCE_LIMIT: process.env.CONFERENCE_LIMIT,
  CRAWL_MAX_REQUESTS_PER_CRAWL: process.env.CRAWL_MAX_REQUESTS_PER_CRAWL,
  DB_URL: process.env.DB_URL,
  TEST: process.env.TEST,
});

// Recursively freeze an object (simple deep freeze for plain objects/arrays)
/**
 * Recursively freeze a plain object/array graph and return it.
 * Freezes the parent BEFORE recursing, so already-frozen nodes act as a
 * visited-marker and cycles terminate. Non-objects are returned untouched.
 */
const deepFreeze = <T>(obj: T): T => {
  const freezable = obj && typeof obj === 'object' && !Object.isFrozen(obj);
  if (!freezable) return obj;

  Object.freeze(obj);
  for (const name of Object.getOwnPropertyNames(obj)) {
    const child = (obj as Record<string, unknown>)[name];
    if (child && typeof child === 'object') {
      deepFreeze(child);
    }
  }
  return obj;
};
Comment on lines +68 to +78
Copy link

Copilot AI Sep 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The deepFreeze function implements a generic recursive freezing utility that could be reused across the codebase. Consider extracting this to a shared utility module to follow the DRY principle and improve maintainability.

Copilot uses AI. Check for mistakes.

// Build the typed configuration object
/**
 * Build the typed configuration object from a raw environment snapshot.
 * Pure: reads only its argument (never process.env), so tests can feed in
 * arbitrary input. The declared return type — rather than an `as AppConfig`
 * cast — lets the compiler actually verify the literal against AppConfig,
 * including excess-property checking, instead of silencing mismatches.
 *
 * @param env Raw (string-valued) environment snapshot to parse.
 * @throws Error when a numeric variable is set but not a valid integer.
 */
const buildConfig = (env: RawEnv): AppConfig => ({
  conference: {
    year: parseIntSafe(env.CONFERENCE_YEAR, DEFAULTS.CONFERENCE_YEAR, 'CONFERENCE_YEAR'),
    week: parseIntSafe(env.CONFERENCE_WEEK, DEFAULTS.CONFERENCE_WEEK, 'CONFERENCE_WEEK'),
    limit: parseIntSafe(env.CONFERENCE_LIMIT, DEFAULTS.CONFERENCE_LIMIT, 'CONFERENCE_LIMIT'),
  },
  crawl: {
    maxRequestsPerCrawl: parseIntSafe(
      env.CRAWL_MAX_REQUESTS_PER_CRAWL,
      DEFAULTS.CRAWL_MAX_REQUESTS_PER_CRAWL,
      'CRAWL_MAX_REQUESTS_PER_CRAWL',
    ),
  },
  db: {
    // `||` (not `??`) is intentional: an empty-string DB_URL also falls back
    url: env.DB_URL || DEFAULTS.DB_URL,
  },
  runtime: {
    // Any non-empty TEST value enables test mode
    isTest: Boolean(env.TEST),
  },
});

// Application-wide configuration singleton: built once from the process.env
// snapshot taken at module load, then deep-frozen against mutation.
export const config: AppConfig = deepFreeze(buildConfig(rawEnv));

// Convenience accessor (pure) – mainly useful for tests when mocking
export const getConfig = (): AppConfig => config;

// Rebuild a configuration from the module-load env snapshot plus overrides.
// NOTE: unlike `config`, the result is NOT frozen — callers (tests) get a
// fresh plain object per call.
export const buildConfigFrom = (partial: Partial<RawEnv>): AppConfig => buildConfig({ ...rawEnv, ...partial });

// Re-export defaults for documentation / potential external tooling
export const defaults = DEFAULTS;

export type { RawEnv };
Copy link

Copilot AI Sep 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The RawEnv type export at the end of the file is inconsistent with the export placement pattern used elsewhere in the file. Consider moving this export closer to the RawEnv interface definition (around line 19) for better code organization.

Copilot uses AI. Check for mistakes.
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import { CheerioCrawler, RequestQueue } from 'crawlee';
import { CrawlerConfig, ConferenceWeekDetail } from './types';
import { router } from './routes';
import { config } from './config.js';

/**
 * Default crawler configuration for the Bundestag agenda ("Tagesordnung") site.
 * Defect fixed: the object literal contained TWO `maxRequestsPerCrawl`
 * properties (a stale process.env-based one and the config-based one), which
 * is a TypeScript compile error; only the centralized-config value is kept.
 */
export const DEFAULT_CONFIG: CrawlerConfig = {
  baseUrl: 'https://www.bundestag.de/tagesordnung',
  maxConcurrency: 1,
  retryOnBlocked: true,
  maxRequestRetries: 10,
  maxRequestsPerMinute: 60,
  // Upper bound sourced from central config (CRAWL_MAX_REQUESTS_PER_CRAWL env var)
  maxRequestsPerCrawl: config.crawl.maxRequestsPerCrawl,
};

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Main entry point with error handling
import { log } from 'crawlee';
import { main } from './main.js';
import { config } from './config.js';
import { getResults } from './routes.js';
import { ConferenceWeekDetailModel, mongoConnect, ProcedureModel } from '@democracy-deutschland/bundestagio-common';
import { IConferenceWeekDetail } from '@democracy-deutschland/bundestagio-common/dist/models/ConferenceWeekDetail/schema.js';
Expand Down Expand Up @@ -145,8 +146,8 @@ const getProcedureIds = async (documents: string[]) => {

export async function run(): Promise<void> {
try {
if (!process.env.TEST) {
await mongoConnect(process.env.DB_URL || 'mongodb://localhost:27017/bundestagio');
if (!config.runtime.isTest) {
await mongoConnect(config.db.url);
}
// Run the crawler
await main();
Expand All @@ -156,7 +157,7 @@ export async function run(): Promise<void> {
log.info('Fetched conference weeks:', results);

// if test return here
if (process.env.TEST) {
if (config.runtime.isTest) {
log.info('Test mode: Skipping MongoDB save');
return;
}
Expand Down
41 changes: 17 additions & 24 deletions services/cron-jobs/import-conference-week-details/src/routes.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { createCheerioRouter, CheerioCrawlingContext } from 'crawlee';
import { extractEntryUrls, extractNavigationData, extractSessionInfo } from './services/html-parser.js';
import { extractNavigationData, extractSessionInfo } from './services/html-parser.js';
import { config } from './config.js';
import { processConferenceWeekDetailUrl } from './utils/url.js';
import { ConferenceWeekDetail } from './types.js';

Expand Down Expand Up @@ -43,29 +44,21 @@ export const startHandler = async ({ $, request, enqueueLinks, log, response }:
throw new Error(`Failed to fetch start URL: ${request.url}`);
}

// Extract conference week URLs
const entryUrls = extractEntryUrls($);

if (entryUrls.length > 0) {
// Enqueue all detail URLs with a label
for (const relativeUrl of entryUrls) {
const absoluteUrl = new URL(relativeUrl, 'https://www.bundestag.de').href;

// Skip already processed URLs
if (processedUrls.has(absoluteUrl)) continue;

await enqueueLinks({
urls: [absoluteUrl],
label: 'DETAIL',
userData: {
sourceUrl: request.url,
},
});
}
log.info(`Enqueued ${entryUrls.length} conference week detail URLs`);
} else {
log.warning('No conference week URLs found on the start page');
}
// Build initial detail URL from central configuration
const absoluteUrl = new URL(
`/apps/plenar/plenar/conferenceweekDetail.form?year=${config.conference.year}&week=${config.conference.week}&limit=${config.conference.limit}`,
'https://www.bundestag.de',
Copy link

Copilot AI Sep 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The base URL 'https://www.bundestag.de' is hardcoded here but also appears in the crawler configuration. Consider moving this to the config module to maintain a single source of truth and improve maintainability.

Suggested change
'https://www.bundestag.de',
config.conference.baseUrl,

Copilot uses AI. Check for mistakes.
).href;

await enqueueLinks({
urls: [absoluteUrl],
label: 'DETAIL',
userData: {
sourceUrl: request.url,
},
});

log.info(`Enqueued conference week detail URLs`);

// Mark the start URL as processed after we've handled it
processedUrls.add(request.url);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,32 +1,11 @@
import { describe, it, expect } from 'vitest';
import cheerio from 'cheerio';
import {
isValidConferenceWeekUrl,
extractEntryUrls,
extractNavigationData,
extractTopItems,
extractDocumentId,
getMonthNumber,
} from './html-parser';
import { extractNavigationData, extractTopItems, extractDocumentId, getMonthNumber } from './html-parser';

describe('HTML Parser - Main Integration Tests', () => {
// These tests ensure that the main html-parser.ts export
// correctly forwards all the functions from the individual modules

describe('URL Parser', () => {
it('should validate conference week URLs', () => {
expect(isValidConferenceWeekUrl('/apps/plenar/plenar/conferenceweekDetail.form')).toBe(true);
});

it('should extract entry URLs', () => {
const html =
'<div class="bt-module-row-sitzungsablauf" data-dataloader-url="/apps/plenar/plenar/conferenceweekDetail.form"></div>';
const $ = cheerio.load(html);
const results = extractEntryUrls($);
expect(results).toEqual(['/apps/plenar/plenar/conferenceweekDetail.form']);
});
});

describe('Navigation Parser', () => {
it('should extract navigation data', () => {
const html = `
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@ import { describe, it, expect } from 'vitest';
import * as parsers from '../index';

describe('Parser Modules Integration', () => {
it('should export all URL parser functions', () => {
expect(parsers.isValidConferenceWeekUrl).toBeDefined();
expect(parsers.getEntryPageUrl).toBeDefined();
expect(parsers.extractEntryUrls).toBeDefined();
});

it('should export all navigation parser functions', () => {
expect(parsers.extractNavigationData).toBeDefined();
});
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// Re-export all parser functions
export * from './url-parser';
export * from './navigation-parser';
export * from './session-parser';
export * from './topic-parser';
Expand Down
Loading
Loading