Skip to content

Commit ce82c23

Browse files
author
fsupulchowk
committed
refactor: improve scraper.js IOE exam notice handling and enhance code robustness
1 parent 2455799 commit ce82c23

File tree

2 files changed

+193
-138
lines changed

2 files changed

+193
-138
lines changed

Dockerfile

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
FROM node:20-slim
2-
32
RUN groupadd -r botuser && useradd -r -g botuser -d /app -s /bin/bash botuser
4-
53
WORKDIR /app
64
RUN chown -R botuser:botuser /app
7-
85
RUN apt-get update && \
96
apt-get install -y --no-install-recommends \
107
wget \
@@ -28,19 +25,12 @@ RUN apt-get update && \
2825
xdg-utils \
2926
chromium \
3027
graphicsmagick \
31-
&& rm -rf /var/lib/apt/lists/* && \
32-
apt-get clean
33-
34-
# Set Chromium path
28+
&& rm -rf /var/lib/apt/lists/* \
29+
&& apt-get clean
3530
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
36-
3731
COPY --chown=botuser:botuser package*.json ./
38-
3932
USER botuser
40-
4133
RUN npm ci --omit=dev && \
4234
npm cache clean --force
43-
4435
COPY --chown=botuser:botuser . .
45-
4636
CMD ["node", "./src/bot.js"]

src/services/scraper.js

Lines changed: 191 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -2,172 +2,237 @@ import axios from "axios";
22
import * as cheerio from "cheerio";
33
import { HttpProxyAgent } from "http-proxy-agent";
44
import { HttpsProxyAgent } from "https-proxy-agent";
5+
import puppeteer from "puppeteer";
56

67
// Pool of desktop-browser User-Agent strings; fetchWithRetry picks one at
// random for each call so requests do not all present the same UA.
const USER_AGENTS = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
  "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
];
1216

1317
/**
 * Fetches a URL with a retry mechanism, proxy support, and rotating user-agents.
 * Includes a Puppeteer fallback if the initial Axios fetches fail or return
 * incomplete data.
 * @param {string} url - The URL to fetch.
 * @param {number} [retries=3] - The number of times to retry on Axios failure.
 * @param {number} [timeout=60000] - The timeout for each request in milliseconds.
 * @returns {Promise<string>} - The HTML data from the URL.
 * @throws {Error} When both the Axios retries and the Puppeteer fallback fail.
 */
async function fetchWithRetry(url, retries = 3, timeout = 60000) {
  const proxyUrl = process.env.PROXY_URL;
  const randomUserAgent =
    USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];

  // The instance config is identical on every attempt, so build it once
  // instead of re-creating it inside the retry loop.
  const axiosInstance = axios.create({
    timeout,
    headers: {
      "User-Agent": randomUserAgent,
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
      "Accept-Language": "en-US,en;q=0.9",
      "Accept-Encoding": "gzip, deflate",
      "Connection": "keep-alive"
    },
    httpAgent: proxyUrl ? new HttpProxyAgent(proxyUrl) : undefined,
    httpsAgent: proxyUrl ? new HttpsProxyAgent(proxyUrl) : undefined,
  });

  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      console.log(
        `[Axios Attempt ${attempt}/${retries}] Fetching ${url}` +
          (proxyUrl ? ` via proxy: ${proxyUrl}` : " directly")
      );

      const { data } = await axiosInstance.get(url);

      // Responses under ~500 bytes are treated as error stubs / placeholder
      // pages and routed through the retry + Puppeteer fallback path.
      if (!data || data.length < 500) {
        console.warn(`[Axios] Data for ${url} is too small or empty (${data ? data.length : 0} bytes). Triggering retry or Puppeteer fallback.`);
        throw new Error("Incomplete or empty data from Axios.");
      }
      console.log(`[Axios Success] Fetched ${url} with ${data.length} bytes.`);
      return data;
    } catch (err) {
      console.error(
        `[Axios Attempt ${attempt}] Failed to fetch ${url}. Error: ${err.message}`
      );
      // Distinguish HTTP errors, network-level failures, and setup errors.
      if (err.response) {
        console.error(`[Axios] HTTP Status: ${err.response.status}, Response Data (first 200 chars): ${String(err.response.data).substring(0, 200)}`);
      } else if (err.request) {
        console.error(`[Axios] No response received. Request made but no data.`);
      } else {
        console.error(`[Axios] Error setting up request: ${err.message}`);
      }

      if (attempt < retries) {
        const waitTime = 5000 * attempt; // linear backoff: 5s, 10s, 15s, ...
        console.warn(`Retrying Axios in ${waitTime / 1000}s...`);
        await new Promise((r) => setTimeout(r, waitTime));
      }
    }
  }

  console.log(`[Puppeteer Fallback] Axios failed after ${retries} attempts for ${url}. Launching Puppeteer...`);
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: "new",
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-gpu',
        '--disable-dev-shm-usage',
        '--single-process',
        ...(proxyUrl ? [`--proxy-server=${proxyUrl}`] : []),
      ],
      ignoreHTTPSErrors: true,
    });

    const page = await browser.newPage();
    await page.setUserAgent(randomUserAgent);

    page.on('console', msg => console.log(`[Browser Console] ${msg.text()}`));
    page.on('pageerror', err => console.error(`[Browser Page Error] ${err.message}`));
    // BUGFIX: HTTPRequest.failure() may return null, so calling .errorText
    // on it unconditionally could throw inside the event listener.
    page.on('requestfailed', request => console.error(`[Browser Request Failed] ${request.url()} ${request.failure()?.errorText ?? 'unknown error'}`));

    // networkidle0 waits until no network connections remain; allow a longer
    // budget than a single Axios request since a full page load is heavier.
    await page.goto(url, { waitUntil: 'networkidle0', timeout: timeout * 3 });

    const htmlContent = await page.content();
    console.log(`[Puppeteer Success] Fetched ${url} with Puppeteer. Content length: ${htmlContent.length} bytes.`);
    return htmlContent;
  } catch (puppeteerErr) {
    console.error(`[Puppeteer Fallback] Failed to fetch ${url} with Puppeteer. Error: ${puppeteerErr.message}`);
    throw new Error(`Failed to fetch ${url} after Axios retries and Puppeteer fallback: ${puppeteerErr.message}`);
  } finally {
    // Always release the Chromium process, even when goto/content throws.
    if (browser) {
      await browser.close();
    }
  }
}
71115

72116
/**
 * Scrapes the latest notices from the IOE Examination Control Division website.
 * @returns {Promise<Array<object>>} - A list of notice objects; empty on failure.
 */
export async function scrapeIoeExamNotice() {
  const url = "http://exam.ioe.edu.np/";
  try {
    const html = await fetchWithRetry(url);
    const $ = cheerio.load(html);
    const results = [];

    $("#datatable tbody tr").each((_, rowEl) => {
      const row = $(rowEl);
      const titleAnchor = row.find("td:nth-child(2) a");
      const dateCell = row.find("td:nth-child(3)");
      const viewAnchor = row.find('td:nth-child(4) a[href*="/Notice/Index/"]');
      const pdfAnchor = row.find('td:nth-child(4) a[target="_blank"]');

      // Skip malformed rows that are missing any of the expected cells.
      if (
        !titleAnchor.length ||
        !dateCell.length ||
        !viewAnchor.length ||
        !pdfAnchor.length
      ) {
        return;
      }

      results.push({
        title: titleAnchor.text().trim(),
        link: new URL(viewAnchor.attr("href"), url).href,
        attachments: [new URL(pdfAnchor.attr("href"), url).href],
        date: dateCell.text().trim(),
        source: "IOE Exam Section",
      });
    });

    return results;
  } catch (err) {
    console.error("[scrapeIoeExamNotice] Error during scraping or parsing:", err.message);
    return [];
  }
}
120165

121166
/**
 * Scrapes the latest notice from the Pulchowk Campus website.
 * @returns {Promise<object|null>} - A single notice object or null on failure.
 */
export async function scrapePcampusNotice() {
  const listUrl = "https://pcampus.edu.np/category/general-notices/";
  try {
    const $list = cheerio.load(await fetchWithRetry(listUrl));
    const latestArticle = $list("article").first();

    const titleAnchor = latestArticle.find("h2.entry-title a");
    const title = titleAnchor.text().trim();
    const pageLink = titleAnchor.attr("href");
    const date = latestArticle.find("time.entry-date").attr("datetime");
    const postId = latestArticle.attr("id");

    // Without a detail-page link there is nothing further to scrape.
    if (!pageLink) {
      console.warn("[scrapePcampusNotice] No page link found in latest article.");
      return null;
    }

    const $page = cheerio.load(await fetchWithRetry(pageLink));

    // Collect attachment URLs into a Set to de-duplicate as we go.
    const attachmentSet = new Set();
    $page(".entry-content a").each((_, el) => {
      const href = $page(el).attr("href");
      if (href?.includes("/wp-content/uploads/")) {
        attachmentSet.add(new URL(href, pageLink).href);
      }
    });

    return {
      id: postId,
      title,
      link: pageLink,
      attachments: [...attachmentSet],
      date,
      source: "Pulchowk Campus",
    };
  } catch (err) {
    console.error("[scrapePcampusNotice] Error during scraping or parsing:", err.message);
    return null;
  }
}
162211

163212
/**
 * Scrapes and combines the latest notices from all sources.
 * Each source is scraped independently via Promise.allSettled, so a failure
 * in one source never blocks or discards the other's results.
 * @returns {Promise<Array<object>>} - A combined list of notice objects.
 */
export async function scrapeLatestNotice() {
  console.log("Scraping latest notices from all sources...");
  const [ioe, pcampus] = await Promise.allSettled([
    scrapeIoeExamNotice(),
    scrapePcampusNotice(),
  ]);

  const combinedNotices = [];

  // BUGFIX: previously a fulfilled-but-falsy result (scrapePcampusNotice
  // legitimately resolves with null) was logged as "failed" with an
  // undefined `reason`; only rejected results carry a `reason`.
  if (ioe.status === 'fulfilled') {
    // scrapeIoeExamNotice resolves with [] on failure, so spreading is safe.
    combinedNotices.push(...(ioe.value ?? []));
  } else {
    console.error("[scrapeLatestNotice] IOE Exam Notice scraping failed:", ioe.reason);
  }

  if (pcampus.status === 'fulfilled') {
    if (pcampus.value) {
      combinedNotices.push(pcampus.value);
    }
  } else {
    console.error("[scrapeLatestNotice] Pulchowk Campus Notice scraping failed:", pcampus.reason);
  }

  return combinedNotices;
}

0 commit comments

Comments
 (0)