-
Notifications
You must be signed in to change notification settings - Fork 9
Fix/import conference week details #683
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,4 +13,5 @@ docker_volumes/*/* | |
| out | ||
| build | ||
| /.env | ||
| .garden | ||
| .garden | ||
| tsconfig.tsbuildinfo | ||
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| import { describe, it, expect } from 'vitest'; | ||
| import { buildConfigFrom, config, defaults } from './config.js'; | ||
|
|
||
// Behavioural tests for the central configuration module.
describe('config', () => {
  it('provides default values when env not set', () => {
    // Blank every variable explicitly so values exported by the developer's
    // shell or the CI runner cannot leak into the expectation.
    const cleared = buildConfigFrom({
      CONFERENCE_YEAR: undefined,
      CONFERENCE_WEEK: undefined,
      CONFERENCE_LIMIT: undefined,
      CRAWL_MAX_REQUESTS_PER_CRAWL: undefined,
      DB_URL: undefined,
      TEST: undefined,
    });
    expect(cleared.conference.year).toBe(defaults.CONFERENCE_YEAR);
    expect(cleared.conference.week).toBe(defaults.CONFERENCE_WEEK);
    expect(cleared.conference.limit).toBe(defaults.CONFERENCE_LIMIT);
    expect(cleared.crawl.maxRequestsPerCrawl).toBe(defaults.CRAWL_MAX_REQUESTS_PER_CRAWL);
    expect(cleared.db.url).toBe(defaults.DB_URL);
    expect(cleared.runtime.isTest).toBe(false);
    // The shared instance is built from the real environment, so only its
    // shape (not its values) can be asserted here.
    expect(typeof config.db.url).toBe('string');
  });

  it('overrides values from environment via buildConfigFrom', () => {
    const custom = buildConfigFrom({
      CONFERENCE_YEAR: '2030',
      CONFERENCE_WEEK: '12',
      CONFERENCE_LIMIT: '25',
      CRAWL_MAX_REQUESTS_PER_CRAWL: '42',
      TEST: '1',
      DB_URL: 'mongodb://example/db',
    });
    expect(custom.conference.year).toBe(2030);
    expect(custom.conference.week).toBe(12);
    expect(custom.conference.limit).toBe(25);
    expect(custom.crawl.maxRequestsPerCrawl).toBe(42);
    expect(custom.runtime.isTest).toBe(true);
    expect(custom.db.url).toBe('mongodb://example/db');
  });

  it('throws on invalid integer', () => {
    expect(() => buildConfigFrom({ CONFERENCE_WEEK: 'not-a-number' })).toThrow(/Invalid integer/);
  });

  it('is immutable', () => {
    // The freeze must reach the nested sections, not just the root object.
    expect(Object.isFrozen(config)).toBe(true);
    expect(Object.isFrozen(config.conference)).toBe(true);
    expect(Object.isFrozen(config.crawl)).toBe(true);
    expect(Object.isFrozen(config.db)).toBe(true);
    expect(Object.isFrozen(config.runtime)).toBe(true);

    const originalWeek = config.conference.week;
    try {
      (config.conference as unknown as { week: number }).week = originalWeek + 1000;
    } catch {
      // Writing to a frozen object throws in strict mode; either outcome is
      // acceptable because the value check below is the real assertion.
    }
    // Value must remain unchanged
    expect(config.conference.week).toBe(originalWeek);
  });
});
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,114 @@ | ||
| /** | ||
| * Central configuration module for the import-conference-week-details service. | ||
| * | ||
| * All access to process.env MUST go through this file so that: | ||
| * - The external (environment) interface is explicit and documented | ||
| * - Defaults and validation are applied in one place | ||
| * - Tests can reliably override configuration | ||
| * - Future refactors (e.g. to use dotenv, secret managers) are localized | ||
| */ | ||
|
|
||
/**
 * Raw shape of the environment variables this module reads.
 *
 * Every entry is optional and, when present, is the unparsed string taken
 * from the environment.
 */
interface RawEnv {
  /** Source string for `conference.year`. */
  CONFERENCE_YEAR?: string;
  /** Source string for `conference.week`. */
  CONFERENCE_WEEK?: string;
  /** Source string for `conference.limit`. */
  CONFERENCE_LIMIT?: string;
  /** Source string for `crawl.maxRequestsPerCrawl`. */
  CRAWL_MAX_REQUESTS_PER_CRAWL?: string;
  /** Source string for `db.url`. */
  DB_URL?: string;
  /** Presence indicates test mode. */
  TEST?: string;
}
|
|
||
/**
 * Public, typed configuration shape consumed by the application.
 *
 * Every property is `readonly`: the exported `config` instance is deep-frozen
 * at runtime, so the type forbids the same writes the runtime rejects.
 */
export interface AppConfig {
  readonly conference: {
    /** Year to start crawling conference weeks. */
    readonly year: number;
    /** Week number to start crawling. */
    readonly week: number;
    /** Pagination limit or item limit when requesting the detail page. */
    readonly limit: number;
  };
  readonly crawl: {
    /** Upper bound of requests per crawl run. */
    readonly maxRequestsPerCrawl: number;
  };
  readonly db: {
    /** Mongo connection string (only used outside of TEST mode). */
    readonly url: string;
  };
  readonly runtime: {
    /** Indicates test mode (skips DB interaction etc.). */
    readonly isTest: boolean;
  };
}
|
|
||
/**
 * Fallback values applied when the matching environment variable is unset or
 * empty. Kept in one frozen object so there is a single source of truth.
 */
const DEFAULTS = Object.freeze({
  /** Fallback for `conference.year`. */
  CONFERENCE_YEAR: 2025,
  /** Fallback for `conference.week`. */
  CONFERENCE_WEEK: 37,
  /** Fallback for `conference.limit`. */
  CONFERENCE_LIMIT: 10,
  /** Fallback for `crawl.maxRequestsPerCrawl`. */
  CRAWL_MAX_REQUESTS_PER_CRAWL: 10,
  /** Fallback for `db.url`. */
  DB_URL: 'mongodb://localhost:27017/bundestagio',
});
|
|
||
/**
 * Parses an environment string as a base-10 integer, with a fallback for
 * missing values.
 *
 * Validation is strict: the whole (trimmed) string must be an optionally
 * signed run of decimal digits. `Number.parseInt` alone would silently accept
 * inputs such as `'12abc'`, `'1.5'` or `'0x10'` by parsing only a prefix.
 *
 * @param value - Raw string; `undefined`, `null` or `''` selects the fallback.
 * @param fallback - Number returned when no value was supplied.
 * @param fieldName - Variable name used in the error message.
 * @returns The parsed integer, or `fallback` when no value was supplied.
 * @throws Error with message `Invalid integer for <fieldName>: '<value>'` when
 *   the value is not a well-formed integer or cannot be represented exactly.
 */
const parseIntSafe = (value: string | undefined, fallback: number, fieldName: string): number => {
  // `== null` also covers a `null` passed in by untyped callers.
  if (value == null || value === '') return fallback;

  const trimmed = value.trim();
  const parsed = /^[+-]?\d+$/.test(trimmed) ? Number.parseInt(trimmed, 10) : Number.NaN;

  // isSafeInteger rejects NaN as well as magnitudes that lose precision.
  if (!Number.isSafeInteger(parsed)) {
    throw new Error(`Invalid integer for ${fieldName}: '${value}'`);
  }
  return parsed;
};
|
|
||
/**
 * Snapshot of the environment variables this module reads, taken once at
 * module load and shallow-frozen so it cannot be mutated afterwards.
 */
const rawEnv: RawEnv = (() => {
  const source = process.env;
  const snapshot: RawEnv = {
    CONFERENCE_YEAR: source.CONFERENCE_YEAR,
    CONFERENCE_WEEK: source.CONFERENCE_WEEK,
    CONFERENCE_LIMIT: source.CONFERENCE_LIMIT,
    CRAWL_MAX_REQUESTS_PER_CRAWL: source.CRAWL_MAX_REQUESTS_PER_CRAWL,
    DB_URL: source.DB_URL,
    TEST: source.TEST,
  };
  return Object.freeze(snapshot);
})();
|
|
||
/**
 * Recursively freezes a plain object or array, in place.
 *
 * Both string- and symbol-keyed own properties are visited, so nested objects
 * stored under symbol keys are frozen too. Each object is frozen before its
 * children are walked, which also makes cyclic references terminate.
 *
 * An object that is already frozen is returned untouched and its children are
 * not visited.
 *
 * @param obj - Value to freeze; primitives and `null` are returned unchanged.
 * @returns The same reference that was passed in.
 */
const deepFreeze = <T>(obj: T): T => {
  if (obj === null || typeof obj !== 'object' || Object.isFrozen(obj)) {
    return obj;
  }

  Object.freeze(obj);
  for (const key of Reflect.ownKeys(obj)) {
    const child = (obj as Record<PropertyKey, unknown>)[key];
    if (child !== null && typeof child === 'object') deepFreeze(child);
  }
  return obj;
};
|
|
||
/**
 * Turns a raw environment snapshot into the typed configuration object.
 *
 * Integer fields are parsed in the order year, week, limit,
 * maxRequestsPerCrawl; the first malformed one makes `parseIntSafe` throw.
 *
 * @param env - Raw string values to interpret.
 * @returns A new, unfrozen configuration object.
 */
const buildConfig = (env: RawEnv): AppConfig => {
  const year = parseIntSafe(env.CONFERENCE_YEAR, DEFAULTS.CONFERENCE_YEAR, 'CONFERENCE_YEAR');
  const week = parseIntSafe(env.CONFERENCE_WEEK, DEFAULTS.CONFERENCE_WEEK, 'CONFERENCE_WEEK');
  const limit = parseIntSafe(env.CONFERENCE_LIMIT, DEFAULTS.CONFERENCE_LIMIT, 'CONFERENCE_LIMIT');
  const maxRequestsPerCrawl = parseIntSafe(
    env.CRAWL_MAX_REQUESTS_PER_CRAWL,
    DEFAULTS.CRAWL_MAX_REQUESTS_PER_CRAWL,
    'CRAWL_MAX_REQUESTS_PER_CRAWL',
  );
  // `||` rather than `??`: an empty DB_URL must fall back to the default too.
  const url = env.DB_URL || DEFAULTS.DB_URL;
  // Any non-empty TEST value switches test mode on.
  const isTest = Boolean(env.TEST);

  return {
    conference: { year, week, limit },
    crawl: { maxRequestsPerCrawl },
    db: { url },
    runtime: { isTest },
  };
};
|
|
||
/** Shared configuration built once from the captured environment and deep-frozen. */
export const config: AppConfig = deepFreeze(buildConfig(rawEnv));

/**
 * Convenience accessor for the shared configuration; mainly useful as a
 * mocking seam in tests.
 *
 * @returns The same frozen object exported as `config`.
 */
export function getConfig(): AppConfig {
  return config;
}

/**
 * Rebuilds a configuration from the captured environment with `partial`
 * layered on top. Keys present in `partial` win, including keys explicitly
 * set to `undefined`.
 *
 * The result is a new object that is not passed through `deepFreeze`.
 *
 * @param partial - Raw overrides to apply.
 * @returns A freshly built configuration.
 */
export function buildConfigFrom(partial: Partial<RawEnv>): AppConfig {
  const merged: RawEnv = { ...rawEnv, ...partial };
  return buildConfig(merged);
}

/** Default values, re-exported for documentation and potential external tooling. */
export const defaults = DEFAULTS;

export type { RawEnv };
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,5 +1,6 @@ | ||||||
| import { createCheerioRouter, CheerioCrawlingContext } from 'crawlee'; | ||||||
| import { extractEntryUrls, extractNavigationData, extractSessionInfo } from './services/html-parser.js'; | ||||||
| import { extractNavigationData, extractSessionInfo } from './services/html-parser.js'; | ||||||
| import { config } from './config.js'; | ||||||
| import { processConferenceWeekDetailUrl } from './utils/url.js'; | ||||||
| import { ConferenceWeekDetail } from './types.js'; | ||||||
|
|
||||||
|
|
@@ -43,29 +44,21 @@ export const startHandler = async ({ $, request, enqueueLinks, log, response }: | |||||
| throw new Error(`Failed to fetch start URL: ${request.url}`); | ||||||
| } | ||||||
|
|
||||||
| // Extract conference week URLs | ||||||
| const entryUrls = extractEntryUrls($); | ||||||
|
|
||||||
| if (entryUrls.length > 0) { | ||||||
| // Enqueue all detail URLs with a label | ||||||
| for (const relativeUrl of entryUrls) { | ||||||
| const absoluteUrl = new URL(relativeUrl, 'https://www.bundestag.de').href; | ||||||
|
|
||||||
| // Skip already processed URLs | ||||||
| if (processedUrls.has(absoluteUrl)) continue; | ||||||
|
|
||||||
| await enqueueLinks({ | ||||||
| urls: [absoluteUrl], | ||||||
| label: 'DETAIL', | ||||||
| userData: { | ||||||
| sourceUrl: request.url, | ||||||
| }, | ||||||
| }); | ||||||
| } | ||||||
| log.info(`Enqueued ${entryUrls.length} conference week detail URLs`); | ||||||
| } else { | ||||||
| log.warning('No conference week URLs found on the start page'); | ||||||
| } | ||||||
| // Build initial detail URL from central configuration | ||||||
| const absoluteUrl = new URL( | ||||||
| `/apps/plenar/plenar/conferenceweekDetail.form?year=${config.conference.year}&week=${config.conference.week}&limit=${config.conference.limit}`, | ||||||
| 'https://www.bundestag.de', | ||||||
|
||||||
| 'https://www.bundestag.de', | |
| config.conference.baseUrl, |
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] The deepFreeze function implements a generic recursive freezing utility that could be reused across the codebase. Consider extracting this to a shared utility module to follow the DRY principle and improve maintainability.