Skip to content

Commit c09c333

Browse files
authored
Add support for parameterized filters in service declarations and add removeQueryParams built-in filter (#1178)
2 parents 927b58c + 9c37507 commit c09c333

21 files changed

+1716
-395
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,15 @@
22

33
All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
44

5+
## Unreleased [minor]
6+
7+
> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs.
8+
9+
### Added
10+
11+
- Add support for parameters in filters; see more in the [filters documentation](https://docs.opentermsarchive.org/terms/how-to/apply-filters/)
12+
- Add `removeQueryParams` built-in filter to remove query parameters from links and images; see more in the [built-in filters documentation](https://docs.opentermsarchive.org/terms/reference/built-in-filters/)
13+
514
## 7.1.0 - 2025-09-10
615

716
_Full changeset and discussions: [#1188](https://github.com/OpenTermsArchive/engine/pull/1188)._

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@ This codebase is a Node.js module enabling downloading, archiving and publishing
44

55
For documentation, visit [docs.opentermsarchive.org](https://docs.opentermsarchive.org/)
66

7+
## Testing
8+
9+
Use `npm test` to run all tests.
10+
11+
Use `npm run test:only <file.test.js>... [--watch]` to run specific test files. The `--watch` option enables running those tests each time a file changes.
12+
713
- - -
814

915
## Contribute

package-lock.json

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
"start:api": "node bin/ota.js serve",
4646
"start:scheduler": "npm start -- --schedule",
4747
"test": "cross-env NODE_ENV=test mocha --recursive \"./src/**/*.test.js\" \"./scripts/**/*.test.js\" --exit",
48+
"test:only": "cross-env NODE_ENV=test mocha --recursive",
4849
"posttest": "npm run lint",
4950
"test:debug": "npm run test -- --inspect-brk --exit"
5051
},
@@ -122,6 +123,6 @@
122123
"@opentermsarchive/terms-types": "^2.0.0"
123124
},
124125
"engines": {
125-
"node": ">=16.0.0"
126+
"node": ">=16.0.0 < 23.0.0"
126127
}
127128
}

scripts/declarations/validate/definitions.js

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,20 @@ const definitions = {
2929
filters: {
3030
type: 'array',
3131
items: {
32-
type: 'string',
33-
pattern: '^.+$',
34-
description: 'Filter function name',
32+
oneOf: [
33+
{
34+
type: 'string',
35+
pattern: '^.+$',
36+
description: 'Filter function name',
37+
},
38+
{
39+
type: 'object',
40+
description: 'Filter function with parameters. The key is the filter function name, the value is the parameters.',
41+
additionalProperties: false,
42+
minProperties: 1,
43+
maxProperties: 1,
44+
},
45+
],
3546
},
3647
},
3748
validUntil: {

scripts/declarations/validate/index.mocha.js

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { expect } from 'chai';
77
import config from 'config';
88
import jsonSourceMap from 'json-source-map';
99

10+
import * as exposedFilters from '../../../src/archivist/extract/exposedFilters.js';
1011
import extract from '../../../src/archivist/extract/index.js';
1112
import fetch, { launchHeadlessBrowser, stopHeadlessBrowser } from '../../../src/archivist/fetcher/index.js';
1213
import * as services from '../../../src/archivist/services/index.js';
@@ -76,6 +77,24 @@ export default async options => {
7677
});
7778
}
7879

80+
// Guards against a service-specific filter shadowing (or being shadowed by) a built-in filter of the same name.
it('filters do not use reserved names', async () => {
  const serviceFiltersPath = path.join(declarationsPath, `${serviceId}.filters.js`);

  // Services without a dedicated filters file have nothing to check
  if (!fsApi.existsSync(serviceFiltersPath)) {
    return;
  }

  const declaredFilters = await services.loadServiceFilters(serviceId);
  const builtInNames = new Set(Object.keys(exposedFilters));
  const conflictingNames = Object.keys(declaredFilters).filter(name => builtInNames.has(name));

  if (conflictingNames.length === 0) {
    return;
  }

  throw new Error(`Service filter file "${serviceId}.filters.js" declares filters with names used by built-in filters: "${conflictingNames.join('", "')}". Rename these filters to avoid a collision.`);
});
97+
7998
if (!schemaOnly && service) {
8099
service.getTermsTypes()
81100
.filter(termsType => {

src/archivist/extract/dom.js

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import jsdom from 'jsdom';
2+
3+
/**
 * Parse HTML into a jsdom document and extend it with selection helpers.
 *
 * The returned object is the jsdom `document` itself, with `select`, `remove`
 * and `selectRange` assigned onto it.
 *
 * A selector is either a CSS selector string or a range selector object with
 * one of `startBefore`/`startAfter` and one of `endBefore`/`endAfter`, each
 * holding a CSS selector.
 *
 * @param {string} content - HTML source of the page.
 * @param {string} location - URL passed to jsdom as the document URL.
 * @returns {Document} The augmented document.
 */
export default function createWebPageDOM(content, location) {
  const dom = new jsdom.JSDOM(content, {
    url: location,
    virtualConsole: new jsdom.VirtualConsole(), // NOTE(review): presumably used to keep page console output out of the process console — confirm
  });
  const { document } = dom.window;

  const helpers = {
    /**
     * Collect clones of everything matched by the given selector(s).
     *
     * @param {string|object|Array<string|object>} contentSelectors - One selector or a list of selectors.
     * @returns {DocumentFragment|null} Fragment holding the cloned content, or `null` when nothing was matched.
     * @throws {Error} When a range selector boundary has no match.
     */
    select(contentSelectors) {
      const fragment = document.createDocumentFragment();

      for (const selector of [].concat(contentSelectors)) {
        if (typeof selector === 'object') {
          // Appending an empty fragment is a no-op, so an empty range contributes nothing
          fragment.appendChild(this.selectRange(selector).cloneContents());
          continue;
        }

        for (const element of document.querySelectorAll(selector)) {
          fragment.appendChild(element.cloneNode(true));
        }
      }

      return fragment.hasChildNodes() ? fragment : null;
    },

    /**
     * Delete from the document everything matched by the given selector(s).
     *
     * @param {string|object|Array<string|object>} insignificantContentSelectors - One selector or a list of selectors.
     * @returns {Document} The object the method was called on, to allow chaining.
     * @throws {Error} When a range selector boundary has no match.
     */
    remove(insignificantContentSelectors) {
      const ranges = [];
      const elements = [];

      // Resolve every selector before mutating anything, so that a failing selector leaves the document untouched
      for (const selector of [].concat(insignificantContentSelectors)) {
        if (typeof selector === 'object') {
          ranges.push(this.selectRange(selector));
        } else {
          elements.push(...document.querySelectorAll(selector));
        }
      }

      // Elements matched by CSS selectors go first, then the contents of the ranges
      for (const element of elements) {
        element.remove();
      }

      for (const range of ranges) {
        range.deleteContents();
      }

      return this;
    },

    /**
     * Build a DOM Range from a range selector object.
     *
     * @param {object} rangeSelector - Object with `startBefore` or `startAfter`, and `endBefore` or `endAfter`.
     * @returns {Range} Range delimited by the matched boundary nodes.
     * @throws {Error} When the start or the end selector matches no node.
     */
    selectRange(rangeSelector) {
      const { startBefore, startAfter, endBefore, endAfter } = rangeSelector;

      const startNode = document.querySelector(startBefore || startAfter);
      const endNode = document.querySelector(endBefore || endAfter);

      if (!startNode) {
        throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
      }

      if (!endNode) {
        throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
      }

      const range = document.createRange();

      if (startBefore) {
        range.setStartBefore(startNode);
      } else {
        range.setStartAfter(startNode);
      }

      if (endBefore) {
        range.setEndBefore(endNode);
      } else {
        range.setEndAfter(endNode);
      }

      return range;
    },
  };

  return Object.assign(document, helpers);
}

src/archivist/extract/dom.test.js

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
import { expect } from 'chai';
2+
3+
import createWebPageDOM from './dom.js';
4+
5+
// Test suite for the document returned by createWebPageDOM and its select / remove / selectRange helpers.
describe('createWebPageDOM', () => {
  const location = 'https://example.com/test';
  const sampleHTML = `
    <!DOCTYPE html>
    <html>
      <head>
        <title>Test Document</title>
      </head>
      <body>
        <header id="header">
          <h1>Main Title</h1>
          <nav class="navigation">
            <a href="/home">Home</a>
            <a href="/about">About</a>
          </nav>
        </header>
        <main>
          <article id="content">
            <p class="introduction">Introduction paragraph</p>
            <p class="central">Central paragraph</p>
            <p class="conclusion">Conclusion paragraph</p>
          </article>
          <aside class="sidebar">
            <div class="widget">Widget content</div>
          </aside>
        </main>
        <footer id="footer">
          <p>Footer content</p>
        </footer>
      </body>
    </html>
  `;

  // Range covering only the central paragraph, reused by several tests
  const betweenIntroductionAndConclusion = {
    startAfter: '.introduction',
    endBefore: '.conclusion',
  };

  // Shared by the read-only tests; tests that mutate the DOM build their own document
  let document;

  before(() => {
    document = createWebPageDOM(sampleHTML, location);
  });

  it('creates a DOM document from HTML content', () => {
    expect(document.documentElement.tagName).to.equal('HTML');
  });

  it('sets the document location', () => {
    expect(document.location.href).to.equal(location);
  });

  it('provides access to the DOM API', () => {
    expect(document.querySelector('title').textContent).to.equal('Test Document');
  });

  describe('#select', () => {
    it('returns elements using CSS selectors', () => {
      const selected = document.select('p.introduction');

      expect(selected.querySelector('p').textContent).to.equal('Introduction paragraph');
    });

    it('returns multiple elements using CSS selectors', () => {
      const selected = document.select('p');

      expect(selected.querySelectorAll('p').length).to.equal(4);
    });

    it('returns elements using an array of CSS selectors', () => {
      const selected = document.select([ 'h1', '.introduction' ]);

      expect(selected.querySelector('h1').textContent).to.equal('Main Title');
      expect(selected.querySelector('p').textContent).to.equal('Introduction paragraph');
    });

    it('returns content using a range selector object', () => {
      const selected = document.select(betweenIntroductionAndConclusion);

      expect(selected.querySelector('p').textContent).to.equal('Central paragraph');
    });

    it('returns null when the selector matches no element', () => {
      expect(document.select('.nonexistent')).to.be.null;
    });
  });

  describe('#remove', () => {
    // Removal mutates the DOM, so each test starts from a fresh document
    let testDocument;

    beforeEach(() => {
      testDocument = createWebPageDOM(sampleHTML, location);
    });

    it('removes elements using CSS selectors', () => {
      testDocument.remove('.sidebar');

      expect(testDocument.querySelector('.sidebar')).to.be.null;
    });

    it('removes multiple elements using CSS selectors', () => {
      testDocument.remove('p');

      expect(testDocument.querySelectorAll('p').length).to.equal(0);
    });

    it('removes elements using an array of CSS selectors', () => {
      testDocument.remove([ 'nav', '.widget' ]);

      expect(testDocument.querySelector('nav')).to.be.null;
      expect(testDocument.querySelector('.widget')).to.be.null;
    });

    it('removes content using a range selector object', () => {
      testDocument.remove(betweenIntroductionAndConclusion);

      expect(testDocument.querySelector('.central')).to.be.null;
    });
  });

  describe('#selectRange', () => {
    // Asserts that selecting the range fails with a message naming the unmatched boundary and echoing the selector
    const expectNoMatchError = (rangeSelector, boundary) => {
      const selectingRange = () => document.selectRange(rangeSelector);

      expect(selectingRange).to.throw(`"${boundary}" selector has no match`);
      expect(selectingRange).to.throw(JSON.stringify(rangeSelector));
    };

    it('creates a range using startAfter and endBefore', () => {
      const contents = document.selectRange(betweenIntroductionAndConclusion).cloneContents();

      expect(contents.querySelector('p').textContent).to.equal('Central paragraph');
    });

    it('creates a range using startBefore and endAfter', () => {
      const aroundCentral = {
        startBefore: '.central',
        endAfter: '.central',
      };
      const contents = document.selectRange(aroundCentral).cloneContents();

      expect(contents.querySelector('p').textContent).to.equal('Central paragraph');
    });

    it('throws a clear error when the startBefore selector has no match', () => {
      expectNoMatchError({ startBefore: '.nonexistent', endBefore: '.conclusion' }, 'start');
    });

    it('throws a clear error when the startAfter selector has no match', () => {
      expectNoMatchError({ startAfter: '.nonexistent', endBefore: '.conclusion' }, 'start');
    });

    it('throws a clear error when the endBefore selector has no match', () => {
      expectNoMatchError({ startAfter: '.introduction', endBefore: '.nonexistent' }, 'end');
    });

    it('throws a clear error when the endAfter selector has no match', () => {
      expectNoMatchError({ startAfter: '.introduction', endAfter: '.nonexistent' }, 'end');
    });
  });
});

0 commit comments

Comments
 (0)