Fix PDF extraction when MIME type contains charset

MattiSG · web-flow · commit 99aa0e8c5770 · 2025-10-07T14:37:24.000+02:00
#1198 Fixes #915
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased [patch]
+
+> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs.
+
+### Fixed
+
+- Increase robustness of PDF content type detection
+
 ## 9.1.0 - 2025-10-01
 
 _Full changeset and discussions: [#1197](https://github.com/OpenTermsArchive/engine/pull/1197)._
diff --git a/src/archivist/extract/index.js b/src/archivist/extract/index.js
@@ -18,7 +18,7 @@ export { ExtractDocumentError } from './errors.js';
  */
 export default async function extract(sourceDocument) {
   try {
-    if (sourceDocument.mimeType == mime.getType('pdf')) {
+    if (mime.getExtension(sourceDocument.mimeType) == 'pdf') {
       return await extractFromPDF(sourceDocument);
     }
 
diff --git a/src/archivist/extract/index.test.js b/src/archivist/extract/index.test.js
@@ -534,6 +534,10 @@ describe('Extract', () => {
         expect(await extract({ content: pdfContent, mimeType: mime.getType('pdf') })).to.equal(expectedExtractedContent);
       });
 
+      it('extracts content from PDF when MIME type includes charset parameter', async () => {
+        expect(await extract({ content: pdfContent, mimeType: 'application/pdf; charset=utf-8' })).to.equal(expectedExtractedContent);
+      });
+
       context('when PDF contains no text', () => {
         it('throws an ExtractDocumentError error', async () => {
           await expect(extract({ content: await fs.readFile(path.resolve(__dirname, '../../../test/fixtures/termsWithoutText.pdf')), mimeType: mime.getType('pdf') })).to.be.rejectedWith(ExtractDocumentError, /contains no text/);

Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@ export { ExtractDocumentError } from './errors.js';`
`18`	`18`	`*/`
`19`	`19`	`export default async function extract(sourceDocument) {`
`20`	`20`	`try {`
`21`		`- if (sourceDocument.mimeType == mime.getType('pdf')) {`
	`21`	`+ if (mime.getExtension(sourceDocument.mimeType) == 'pdf') {`
`22`	`22`	`return await extractFromPDF(sourceDocument);`
`23`	`23`	`}`
`24`	`24`