|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// Copyright © 2025 NumFOCUS, Inc. for the Insight Software Consortium |
| 3 | + |
| 4 | +/** |
| 5 | + * Test script to download a few article PDFs from the Insight Journal website |
| 6 | + * Downloads PDFs from publication IDs 9 to 11 for testing |
| 7 | + */ |
| 8 | + |
| 9 | +import { chromium } from "playwright"; |
| 10 | +import { mkdir } from "fs/promises"; |
| 11 | +import { dirname, join } from "path"; |
| 12 | +import { fileURLToPath } from "url"; |
| 13 | + |
// ES modules have no built-in __filename/__dirname, so derive them from the
// module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Configuration - TEST VERSION WITH SMALL RANGE
const START_ID = 9; // first publication ID to process
const END_ID = 11; // last publication ID (inclusive): only 3 PDFs for testing
const BASE_URL = "https://insight-journal.org/browse/publication";
const PDF_DIR = join(__dirname, "..", "archive", "pdfs"); // output folder for saved PDFs
const TIMEOUT = 60_000; // milliseconds; timeout for navigation and downloads
const WAIT_AFTER_CLICK = 3_000; // milliseconds to pause after clicking the Article tab
| 24 | + |
/**
 * Create the PDF output directory (PDF_DIR), including any missing parent
 * directories, and log its location.
 *
 * @returns {Promise<void>} Resolves once the directory has been created or
 *   was found to exist already.
 */
async function ensurePdfDirectory() {
  // `recursive: true` creates missing parents and does not fail when the
  // directory is already present.
  const creationOptions = { recursive: true };
  await mkdir(PDF_DIR, creationOptions);

  console.log(`📁 PDF directory ready: ${PDF_DIR}`);
}
| 32 | + |
/**
 * Download the PDF of a single publication.
 *
 * Opens `${BASE_URL}/<id>`, switches to the "Article" tab, clicks
 * "Download PDF" and saves the resulting download as `<id>.pdf` inside
 * PDF_DIR. Failures are never thrown to the caller; they are reported through
 * the returned object.
 *
 * @param {import("playwright").Page} page - Page used for the navigation.
 * @param {number|string} insightJournalId - Publication ID; used both in the
 *   page URL and in the output file name.
 * @returns {Promise<{success: boolean, id: number|string, error?: string}>}
 *   `{ success: true, id }` once the file has been saved, otherwise
 *   `{ success: false, id, error }` with a short reason.
 */
async function downloadPdf(page, insightJournalId) {
  const url = `${BASE_URL}/${insightJournalId}`;
  const pdfPath = join(PDF_DIR, `${insightJournalId}.pdf`);

  try {
    console.log(`\n🔍 Processing publication ${insightJournalId}...`);

    // Navigate to the publication page
    console.log(` Navigating to ${url}`);
    await page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout: TIMEOUT,
    });

    // Wait for the page to be ready
    await page.waitForLoadState("networkidle", { timeout: TIMEOUT });

    // Click on the "Article" tab
    console.log(` Clicking Article tab...`);
    const articleTab = page.getByRole("tab", { name: "Article" });

    // Check if Article tab exists
    const articleTabExists = (await articleTab.count()) > 0;
    if (!articleTabExists) {
      console.log(
        ` ⚠️ Article tab not found for publication ${insightJournalId}`
      );
      return {
        success: false,
        id: insightJournalId,
        error: "Article tab not found",
      };
    }

    await articleTab.click();

    // Wait for the Article content to load
    console.log(` Waiting for content to load...`);
    await page.waitForTimeout(WAIT_AFTER_CLICK);

    // Build the locator once; it is used for both the visibility wait and
    // the click.
    const downloadButton = page.getByRole("button", { name: "Download PDF" });

    try {
      await downloadButton.waitFor({
        state: "visible",
        timeout: TIMEOUT,
      });
    } catch (err) {
      // Log the underlying cause as well, so a timeout can be told apart from
      // other failures. The returned `error` string is kept stable.
      console.log(
        ` ⚠️ Download PDF button not found for publication ${insightJournalId}: ${err.message}`
      );
      return {
        success: false,
        id: insightJournalId,
        error: "Download PDF button not found",
      };
    }

    // Start listening for the download before clicking. Both promises are
    // awaited together so that, if the click fails, a later rejection of the
    // download wait is still handled instead of becoming an unhandled
    // rejection.
    console.log(` Initiating PDF download...`);
    const [download] = await Promise.all([
      page.waitForEvent("download", { timeout: TIMEOUT }),
      downloadButton.click(),
    ]);

    // Save the downloaded file
    await download.saveAs(pdfPath);
    console.log(` ✅ Downloaded: ${pdfPath}`);

    return { success: true, id: insightJournalId };
  } catch (error) {
    console.error(
      ` ❌ Error downloading publication ${insightJournalId}:`,
      error.message
    );
    return { success: false, id: insightJournalId, error: error.message };
  }
}
| 116 | + |
/**
 * Advance the page to the following publication by clicking the button
 * inside the element labelled "Next publication".
 *
 * @param {import("playwright").Page} page - Page currently showing a publication.
 * @returns {Promise<boolean>} `true` when the button was clicked and the page
 *   reached the "networkidle" load state; `false` when the button is missing
 *   or any step threw (the error message is logged).
 */
async function goToNextPublication(page) {
  try {
    const nextControl = page
      .getByLabel("Next publication")
      .getByRole("button");
    const matchCount = await nextControl.count();

    if (matchCount > 0) {
      await nextControl.click();

      // Let the navigation triggered by the click settle before returning.
      await page.waitForLoadState("networkidle", { timeout: TIMEOUT });
      return true;
    }

    console.log(` ⚠️ Next publication button not found`);
    return false;
  } catch (navigationError) {
    console.error(
      ` ❌ Error navigating to next publication:`,
      navigationError.message
    );
    return false;
  }
}
| 142 | + |
/**
 * Entry point: download the PDFs for publication IDs START_ID through END_ID
 * (inclusive) and print a summary.
 *
 * Per-publication failures are collected and listed in the summary. An
 * unexpected error inside the download loop is logged and marks the process
 * as failed (`process.exitCode = 1`); the browser is closed in every case.
 *
 * @returns {Promise<void>} Resolves after the summary has been printed.
 *   Rejects if the output directory cannot be created or the browser cannot
 *   be launched.
 */
async function main() {
  console.log("🚀 Starting PDF download TEST...");
  console.log(` Range: Publication ${START_ID} to ${END_ID} (TEST)`);

  // Ensure the PDF directory exists
  await ensurePdfDirectory();

  // Launch browser
  console.log("\n🌐 Launching browser...");
  const browser = await chromium.launch({
    headless: true,
    timeout: TIMEOUT,
  });

  // Track results
  const results = {
    successful: [],
    failed: [],
  };

  try {
    // Created inside the try block so that a failure here still reaches the
    // `finally` below and the browser process is not left running.
    const context = await browser.newContext({
      acceptDownloads: true,
    });
    const page = await context.newPage();

    for (let currentId = START_ID; currentId <= END_ID; currentId++) {
      const result = await downloadPdf(page, currentId);

      if (result.success) {
        results.successful.push(result.id);
      } else {
        results.failed.push({ id: result.id, error: result.error });
      }

      // Between publications (not after the last one), try the site's own
      // "next" control. The loop advances to the next ID whether or not that
      // succeeds, because downloadPdf() is given the ID explicitly.
      if (currentId < END_ID) {
        console.log(` 📄 Moving to next publication...`);
        const navigated = await goToNextPublication(page);

        if (!navigated) {
          console.log(
            ` ⚠️ Could not navigate to next publication. Moving to next ID manually...`
          );
        }

        // Small delay between downloads to be respectful to the server
        await page.waitForTimeout(1000);
      }
    }
  } catch (error) {
    console.error("\n❌ Fatal error:", error);
    // Signal the failure to the caller (shell, CI) instead of exiting 0.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }

  // Print summary
  console.log("\n" + "=".repeat(60));
  console.log("📊 DOWNLOAD SUMMARY");
  console.log("=".repeat(60));
  console.log(`✅ Successful: ${results.successful.length}`);
  console.log(`❌ Failed: ${results.failed.length}`);

  if (results.failed.length > 0) {
    console.log("\n❌ Failed publications:");
    results.failed.forEach(({ id, error }) => {
      console.log(` - Publication ${id}: ${error}`);
    });
  }

  console.log("\n✨ Test completed!");
}
| 229 | + |
// Run the script. If main() rejects, report the error and set a non-zero
// exit code so shells and CI can detect the failure.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
0 commit comments