Skip to content

Commit d095616

Browse files
committed
fix: remove Chromium and headless-browser (Puppeteer) crawling from scraper.js and Dockerfile
1 parent 531c6a2 commit d095616

File tree

2 files changed

+55
-91
lines changed

2 files changed

+55
-91
lines changed

Dockerfile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ RUN apt-get update && \
2828
libxdamage1 \
2929
libxrandr2 \
3030
xdg-utils \
31-
chromium \
3231
graphicsmagick \
3332
ghostscript \
3433
&& rm -rf /var/lib/apt/lists/* \

src/services/scraper.js

Lines changed: 55 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ import axios from "axios";
22
import * as cheerio from "cheerio";
33
import { HttpProxyAgent } from "http-proxy-agent";
44
import { HttpsProxyAgent } from "https-proxy-agent";
5-
import puppeteer from "puppeteer";
6-
// Import the new logging system
75
import { log } from '../utils/debug.js';
86

97
const USER_AGENTS = [
@@ -18,7 +16,6 @@ const USER_AGENTS = [
1816

1917
/**
2018
* Fetches a URL with a retry mechanism, proxy support, and rotating user-agents.
21-
* Includes a Puppeteer fallback if initial Axios fetch fails or returns incomplete data.
2219
* @param {string} url - The URL to fetch.
2320
* @param {number} [retries=3] - The number of times to retry on Axios failure.
2421
* @param {number} [timeout=60000] - The timeout for each request in milliseconds.
@@ -30,7 +27,6 @@ export async function fetchWithRetry(url, retries = 3, timeout = 60000) {
3027

3128
for (let attempt = 1; attempt <= retries; attempt++) {
3229
try {
33-
// Use the new log function
3430
log(
3531
`[Axios Attempt ${attempt}/${retries}] Fetching ${url}` +
3632
(proxyUrl ? ` via proxy: ${proxyUrl}` : " directly"), 'info'
@@ -53,15 +49,12 @@ export async function fetchWithRetry(url, retries = 3, timeout = 60000) {
5349
const { data } = await axiosInstance.get(url);
5450

5551
if (!data || data.length < 500) {
56-
// Use the new log function
57-
log(`[Axios] Data for ${url} is too small or empty (${data ? data.length : 0} bytes). Triggering retry or Puppeteer fallback.`, 'warn');
52+
log(`[Axios] Data for ${url} is too small or empty (${data ? data.length : 0} bytes). Triggering retry.`, 'warn');
5853
throw new Error("Incomplete or empty data from Axios.");
5954
}
60-
// Use the new log function
6155
log(`[Axios Success] Fetched ${url} with ${data.length} bytes.`, 'info');
6256
return data;
6357
} catch (err) {
64-
// Use the new log function
6558
log(
6659
`[Axios Attempt ${attempt}] Failed to fetch ${url}. Error: ${err.message}`, 'error'
6760
);
@@ -73,45 +66,18 @@ export async function fetchWithRetry(url, retries = 3, timeout = 60000) {
7366
log(`[Axios] Error setting up request: ${err.message}`, 'error');
7467
}
7568
if (attempt === retries) {
76-
log(`[Axios] All attempts failed. Initiating Puppeteer fallback.`, 'warn');
77-
break;
69+
log(`[Axios] All attempts failed. Throwing final error.`, 'error');
70+
throw new Error(`Failed to fetch ${url} after multiple attempts.`);
7871
}
7972
}
8073
}
81-
82-
try {
83-
// Use the new log function
84-
log(`[Puppeteer] Starting Puppeteer fallback for ${url}`, 'info');
85-
86-
const browser = await puppeteer.launch({
87-
args: ["--no-sandbox", "--disable-setuid-sandbox"],
88-
headless: "new"
89-
});
90-
const page = await browser.newPage();
91-
await page.setUserAgent(randomUserAgent);
92-
await page.goto(url, { waitUntil: "networkidle2", timeout });
93-
const htmlContent = await page.content();
94-
await browser.close();
95-
96-
if (!htmlContent || htmlContent.length < 500) {
97-
log(`[Puppeteer] Data for ${url} is too small or empty (${htmlContent ? htmlContent.length : 0} bytes).`, 'warn');
98-
throw new Error("Incomplete or empty data from Puppeteer.");
99-
}
100-
101-
log(`[Puppeteer Success] Fetched ${url} with ${htmlContent.length} bytes.`, 'info');
102-
return htmlContent;
103-
} catch (err) {
104-
log(`[Puppeteer] Failed to fetch ${url} via Puppeteer: ${err.message}`, 'error');
105-
throw new Error(`Failed to fetch ${url} after multiple attempts.`);
106-
}
10774
}
10875

10976
/**
11077
* Scrapes IOE Exam notices from the official website.
11178
* @returns {Promise<Array<object>>} - A list of notice objects.
11279
*/
11380
export async function scrapeIoeExamNotice() {
114-
// Reverted URL and selectors to the previous working version
11581
const url = "http://exam.ioe.edu.np/";
11682
log(`[scrapeIoeExamNotice] Scraping ${url}`, 'info');
11783
try {
@@ -120,30 +86,30 @@ export async function scrapeIoeExamNotice() {
12086
const notices = [];
12187

12288
$("#datatable tbody tr").each((_, el) => {
123-
const row = $(el);
124-
const titleElement = row.find("td:nth-child(2) a");
125-
const dateElement = row.find("td:nth-child(3)");
126-
const viewLinkElement = row.find(
127-
'td:nth-child(4) a[href*="/Notice/Index/"]'
128-
);
129-
const downloadLinkElement = row.find(
130-
'td:nth-child(4) a[target="_blank"]'
131-
);
132-
133-
if (titleElement.length && dateElement.length && viewLinkElement.length && downloadLinkElement.length) {
134-
const title = titleElement.text().trim();
135-
const date = dateElement.text().trim();
136-
const noticePageLink = new URL(viewLinkElement.attr("href"), url).href;
137-
const pdfLink = new URL(downloadLinkElement.attr("href"), url).href;
138-
139-
notices.push({
140-
title,
141-
link: noticePageLink,
142-
attachments: [pdfLink],
143-
date,
144-
source: "IOE Exam Section",
145-
});
146-
}
89+
const row = $(el);
90+
const titleElement = row.find("td:nth-child(2) a");
91+
const dateElement = row.find("td:nth-child(3)");
92+
const viewLinkElement = row.find(
93+
'td:nth-child(4) a[href*="/Notice/Index/"]'
94+
);
95+
const downloadLinkElement = row.find(
96+
'td:nth-child(4) a[target="_blank"]'
97+
);
98+
99+
if (titleElement.length && dateElement.length && viewLinkElement.length && downloadLinkElement.length) {
100+
const title = titleElement.text().trim();
101+
const date = dateElement.text().trim();
102+
const noticePageLink = new URL(viewLinkElement.attr("href"), url).href;
103+
const pdfLink = new URL(downloadLinkElement.attr("href"), url).href;
104+
105+
notices.push({
106+
title,
107+
link: noticePageLink,
108+
attachments: [pdfLink],
109+
date,
110+
source: "IOE Exam Section",
111+
});
112+
}
147113
});
148114
log(`[scrapeIoeExamNotice] Scraped ${notices.length} notices.`, 'info');
149115
return notices;
@@ -159,51 +125,50 @@ export async function scrapeIoeExamNotice() {
159125
* @returns {Promise<Array<object>>} - A list of notice objects.
160126
*/
161127
export async function scrapePcampusNotice() {
162-
// Reverted URL and selectors to the previous working version
163128
const listUrl = "https://pcampus.edu.np/";
164129
log(`[scrapePcampusNotice] Scraping ${listUrl}`, 'info');
165130

166131
try {
167132
const listData = await fetchWithRetry(listUrl);
168133
const $list = cheerio.load(listData);
169-
const noticeItems = $list("#recent-posts-2 ul li");
134+
const noticeItems = $list("#recent-posts-2 ul li");
170135
if (noticeItems.length === 0) {
171-
log("[scrapePcampusNotice] Could not find any notices in the widget.", 'warn');
172-
return [];
136+
log("[scrapePcampusNotice] Could not find any notices in the widget.", 'warn');
137+
return [];
173138
}
174139
const noticeDetailPromises = [];
175140
noticeItems.each((_, el) => {
176-
const item = $list(el);
177-
const titleElement = item.find("a");
178-
const pageLink = titleElement.attr("href");
179-
const title = titleElement.text().trim();
180-
const date = item.find(".post-date").text().trim();
181-
if (pageLink) {
182-
const detailPromise = (async () => {
183-
try {
184-
const pageData = await fetchWithRetry(pageLink);
185-
const $page = cheerio.load(pageData);
186-
const attachments = [];
187-
$page(".entry-content a").each((_, a) => {
188-
const href = $page(a).attr("href");
189-
if (href?.includes("/wp-content/uploads/")) {
190-
attachments.push(new URL(href, pageLink).href);
191-
}
192-
});
193-
return { title, link: pageLink, attachments: [...new Set(attachments)], date, source: "Pulchowk Campus" };
194-
} catch (err) {
195-
log(`[scrapePcampusNotice] Failed to fetch details for ${pageLink}. Error: ${err.message}`, 'error');
196-
return null;
197-
}
198-
})();
199-
noticeDetailPromises.push(detailPromise);
141+
const item = $list(el);
142+
const titleElement = item.find("a");
143+
const pageLink = titleElement.attr("href");
144+
const title = titleElement.text().trim();
145+
const date = item.find(".post-date").text().trim();
146+
if (pageLink) {
147+
const detailPromise = (async () => {
148+
try {
149+
const pageData = await fetchWithRetry(pageLink);
150+
const $page = cheerio.load(pageData);
151+
const attachments = [];
152+
$page(".entry-content a").each((_, a) => {
153+
const href = $page(a).attr("href");
154+
if (href?.includes("/wp-content/uploads/")) {
155+
attachments.push(new URL(href, pageLink).href);
156+
}
157+
});
158+
return { title, link: pageLink, attachments: [...new Set(attachments)], date, source: "Pulchowk Campus" };
159+
} catch (err) {
160+
log(`[scrapePcampusNotice] Failed to fetch details for ${pageLink}. Error: ${err.message}`, 'error');
161+
return null;
200162
}
163+
})();
164+
noticeDetailPromises.push(detailPromise);
165+
}
201166
});
202167
const results = await Promise.all(noticeDetailPromises);
203168
return results.filter(notice => notice !== null);
204169
} catch (err) {
205170
log("[scrapePcampusNotice] Error during scraping or parsing:", 'error', null, err, 'error');
206-
return [];
171+
return [];
207172
}
208173
}
209174

0 commit comments

Comments
 (0)