Skip to content

Commit bc19b2c

Browse files
committed
feat: add playwright script to download current website pdfs
1 parent 2a61add commit bc19b2c

File tree

4 files changed

+494
-1
lines changed

4 files changed

+494
-1
lines changed

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,12 @@
2626
"fuse-post-process": "node scripts/fuse-post-process.mjs",
2727
"build": "pnpm run setup-assets && astro build",
2828
"preview": "astro preview",
29-
"setup-assets": "node scripts/setup-assets.mjs"
29+
"setup-assets": "node scripts/setup-assets.mjs",
30+
"download-pdfs": "node scripts/download-pdfs.mjs"
3031
},
3132
"devDependencies": {
3233
"@types/node": "^24.3.0",
34+
"playwright": "^1.56.1",
3335
"sharp": "^0.34.3",
3436
"yaml": "^2.8.1"
3537
}

pnpm-lock.yaml

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/download-pdfs-test.mjs

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// Copyright © 2025 NumFOCUS, Inc. for the Insight Software Consortium
3+
4+
/**
5+
* Test script to download a few article PDFs from the Insight Journal website
6+
* Downloads PDFs from publication IDs 9 to 11 for testing
7+
*/
8+
9+
import { chromium } from "playwright";
10+
import { mkdir } from "fs/promises";
11+
import { dirname, join } from "path";
12+
import { fileURLToPath } from "url";
13+
14+
// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Configuration - TEST VERSION WITH SMALL RANGE

// Inclusive range of publication IDs to process (9, 10 and 11 for testing).
const START_ID = 9;
const END_ID = 11;

// A publication page is addressed as `${BASE_URL}/<id>`.
const BASE_URL = "https://insight-journal.org/browse/publication";

// Output folder: archive/pdfs under the parent of this script's directory.
const PDF_DIR = join(__dirname, "../archive/pdfs");

// Durations in milliseconds.
const TIMEOUT = 60_000; // upper bound for navigation and downloads
const WAIT_AFTER_CLICK = 3_000; // pause after clicking the Article tab
24+
25+
/**
 * Create the PDF output directory (and any missing parents) and log its path.
 *
 * Uses `recursive: true`, so calling it when the directory already exists
 * is not an error.
 *
 * @returns {Promise<void>}
 */
async function ensurePdfDirectory() {
  const mkdirOptions = { recursive: true };
  await mkdir(PDF_DIR, mkdirOptions);

  const readyMessage = `📁 PDF directory ready: ${PDF_DIR}`;
  console.log(readyMessage);
}
32+
33+
/**
 * Download the PDF of a single publication.
 *
 * Opens `${BASE_URL}/<id>`, clicks the "Article" tab, waits for the
 * "Download PDF" button, clicks it and stores the resulting download as
 * `<id>.pdf` inside PDF_DIR.
 *
 * Failures inside the browsing steps are reported through the returned
 * object instead of being thrown, so a caller can keep processing other IDs.
 *
 * @param {object} page - Playwright page used for navigation and clicks.
 * @param {number|string} insightJournalId - Publication ID appended to BASE_URL.
 * @returns {Promise<{success: boolean, id: (number|string), error?: string}>}
 *   `success: true` when the file was saved; otherwise `success: false`
 *   with a short `error` description.
 */
async function downloadPdf(page, insightJournalId) {
  const url = `${BASE_URL}/${insightJournalId}`;
  const pdfPath = join(PDF_DIR, `${insightJournalId}.pdf`);

  try {
    console.log(`\n🔍 Processing publication ${insightJournalId}...`);

    // Navigate to the publication page
    console.log(` Navigating to ${url}`);
    await page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout: TIMEOUT,
    });

    // Wait for the page to be ready
    await page.waitForLoadState("networkidle", { timeout: TIMEOUT });

    // Click on the "Article" tab
    console.log(` Clicking Article tab...`);
    const articleTab = page.getByRole("tab", { name: "Article" });

    // Publications without an Article tab are reported, not treated as a crash.
    if ((await articleTab.count()) === 0) {
      console.log(
        ` ⚠️ Article tab not found for publication ${insightJournalId}`
      );
      return {
        success: false,
        id: insightJournalId,
        error: "Article tab not found",
      };
    }

    await articleTab.click();

    // Give the Article content a fixed pause before looking for the button.
    console.log(` Waiting for content to load...`);
    await page.waitForTimeout(WAIT_AFTER_CLICK);

    // One locator is reused for both the visibility wait and the click.
    const downloadButton = page.getByRole("button", { name: "Download PDF" });
    try {
      await downloadButton.waitFor({
        state: "visible",
        timeout: TIMEOUT,
      });
    } catch {
      // Any failure of the visibility wait is reported as a missing button.
      console.log(
        ` ⚠️ Download PDF button not found for publication ${insightJournalId}`
      );
      return {
        success: false,
        id: insightJournalId,
        error: "Download PDF button not found",
      };
    }

    // Start listening for the download BEFORE clicking, and await both
    // together. If the click rejects, Promise.all still observes the event
    // promise, so its later timeout cannot surface as an unhandled rejection.
    console.log(` Initiating PDF download...`);
    const [download] = await Promise.all([
      page.waitForEvent("download", { timeout: TIMEOUT }),
      downloadButton.click(),
    ]);

    // Save the downloaded file
    await download.saveAs(pdfPath);
    console.log(` ✅ Downloaded: ${pdfPath}`);

    return { success: true, id: insightJournalId };
  } catch (error) {
    console.error(
      ` ❌ Error downloading publication ${insightJournalId}:`,
      error.message
    );
    return { success: false, id: insightJournalId, error: error.message };
  }
}
116+
117+
/**
 * Click the page's "Next publication" button and wait for the network to settle.
 *
 * @param {object} page - Playwright page currently showing a publication.
 * @returns {Promise<boolean>} `true` once the click and the following
 *   "networkidle" wait succeeded; `false` when the button is missing or
 *   any step threw (the error message is logged).
 */
async function goToNextPublication(page) {
  try {
    // The button is looked up inside the element labelled "Next publication".
    const labelledRegion = page.getByLabel("Next publication");
    const nextControl = labelledRegion.getByRole("button");

    const matches = await nextControl.count();
    if (!(matches > 0)) {
      console.log(` ⚠️ Next publication button not found`);
      return false;
    }

    await nextControl.click();

    // Wait for navigation to complete
    const waitOptions = { timeout: TIMEOUT };
    await page.waitForLoadState("networkidle", waitOptions);
    return true;
  } catch (error) {
    console.error(` ❌ Error navigating to next publication:`, error.message);
    return false;
  }
}
142+
143+
/**
 * Entry point: download the PDFs for publications START_ID..END_ID
 * (inclusive) and print a summary.
 *
 * Each ID is handled by downloadPdf(). Between IDs the "Next publication"
 * button is clicked and a one-second pause is inserted. The browser is
 * always closed, including when creating the context or page fails.
 * A fatal error is logged and sets `process.exitCode` to 1; the summary is
 * still printed afterwards.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log("🚀 Starting PDF download TEST...");
  console.log(` Range: Publication ${START_ID} to ${END_ID} (TEST)`);

  // Ensure the PDF directory exists
  await ensurePdfDirectory();

  // Launch browser
  console.log("\n🌐 Launching browser...");
  const browser = await chromium.launch({
    headless: true,
    timeout: TIMEOUT,
  });

  // Track results
  const results = {
    successful: [],
    failed: [],
  };

  try {
    // Context and page are created inside the try block so the `finally`
    // below closes the browser even when one of these calls throws.
    const context = await browser.newContext({
      acceptDownloads: true,
    });
    const page = await context.newPage();

    for (let currentId = START_ID; currentId <= END_ID; currentId++) {
      const result = await downloadPdf(page, currentId);

      if (result.success) {
        results.successful.push(result.id);
      } else {
        results.failed.push({ id: result.id, error: result.error });
      }

      // Nothing to navigate to after the last publication.
      if (currentId === END_ID) {
        break;
      }

      console.log(` 📄 Moving to next publication...`);
      const navigated = await goToNextPublication(page);

      // The next iteration opens the page by ID either way, so a failed
      // click only needs to be reported.
      if (!navigated) {
        console.log(
          ` ⚠️ Could not navigate to next publication. Moving to next ID manually...`
        );
      }

      // Small delay between downloads to be respectful to the server
      await page.waitForTimeout(1000);
    }
  } catch (error) {
    console.error("\n❌ Fatal error:", error);
    // Report the failure to the calling shell / CI job.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }

  // Print summary
  console.log("\n" + "=".repeat(60));
  console.log("📊 DOWNLOAD SUMMARY");
  console.log("=".repeat(60));
  console.log(`✅ Successful: ${results.successful.length}`);
  console.log(`❌ Failed: ${results.failed.length}`);

  if (results.failed.length > 0) {
    console.log("\n❌ Failed publications:");
    for (const { id, error } of results.failed) {
      console.log(` - Publication ${id}: ${error}`);
    }
  }

  console.log("\n✨ Test completed!");
}
229+
230+
// Run the script. A rejection that escapes main() is logged and turned into
// a non-zero exit code so shells and CI jobs can detect the failure.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

0 commit comments

Comments
 (0)