import axios from "axios";
import * as cheerio from "cheerio";
import { HttpProxyAgent } from "http-proxy-agent";
import { HttpsProxyAgent } from "https-proxy-agent";
import puppeteer from "puppeteer";
56
// Pool of realistic desktop browser user-agent strings. One entry is picked
// at random per fetch (see fetchWithRetry) to make requests look less bot-like.
const USER_AGENTS = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
  "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0",
];
1216
/**
 * Fetches a URL with a retry mechanism, proxy support, and rotating user-agents.
 * Falls back to a headless Puppeteer browser when every Axios attempt fails or
 * returns suspiciously small/empty content (e.g. a bot-detection interstitial).
 *
 * @param {string} url - The URL to fetch.
 * @param {number} [retries=3] - The number of Axios attempts before the Puppeteer fallback.
 * @param {number} [timeout=60000] - Per-request timeout in milliseconds (Puppeteer gets 3x).
 * @returns {Promise<string>} - The HTML data from the URL.
 * @throws {Error} When both the Axios attempts and the Puppeteer fallback fail.
 */
async function fetchWithRetry(url, retries = 3, timeout = 60000) {
  const proxyUrl = process.env.PROXY_URL;
  const randomUserAgent =
    USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];

  // Build the client (and proxy agents) ONCE. The previous version recreated
  // them on every attempt, wasting sockets and defeating keep-alive.
  const axiosInstance = axios.create({
    timeout,
    headers: {
      "User-Agent": randomUserAgent,
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
      "Accept-Language": "en-US,en;q=0.9",
      "Accept-Encoding": "gzip, deflate",
      Connection: "keep-alive",
    },
    httpAgent: proxyUrl ? new HttpProxyAgent(proxyUrl) : undefined,
    httpsAgent: proxyUrl ? new HttpsProxyAgent(proxyUrl) : undefined,
  });

  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      console.log(
        `[Axios Attempt ${attempt}/${retries}] Fetching ${url}` +
          (proxyUrl ? ` via proxy: ${proxyUrl}` : " directly")
      );

      const { data } = await axiosInstance.get(url);

      // Guard against bot-block pages and truncated responses. Coerce to a
      // string first: a non-string payload (e.g. auto-parsed JSON) has no
      // usable .length and would previously slip through `data.length < 500`
      // and break cheerio.load downstream.
      const html = typeof data === "string" ? data : String(data ?? "");
      if (html.length < 500) {
        console.warn(
          `[Axios] Data for ${url} is too small or empty (${html.length} bytes). Triggering retry or Puppeteer fallback.`
        );
        throw new Error("Incomplete or empty data from Axios.");
      }
      console.log(`[Axios Success] Fetched ${url} with ${html.length} bytes.`);
      return html;
    } catch (err) {
      console.error(
        `[Axios Attempt ${attempt}] Failed to fetch ${url}. Error: ${err.message}`
      );
      // Distinguish the three Axios failure modes for easier debugging.
      if (err.response) {
        console.error(
          `[Axios] HTTP Status: ${err.response.status}, Response Data (first 200 chars): ${String(err.response.data).substring(0, 200)}`
        );
      } else if (err.request) {
        console.error(`[Axios] No response received. Request made but no data.`);
      } else {
        console.error(`[Axios] Error setting up request: ${err.message}`);
      }

      if (attempt < retries) {
        const waitTime = 5000 * attempt; // linear backoff: 5s, 10s, ...
        console.warn(`Retrying Axios in ${waitTime / 1000}s...`);
        await new Promise((r) => setTimeout(r, waitTime));
      }
    }
  }

  // All Axios attempts exhausted: render the page with a real browser, which
  // also gets past JS-based bot checks that a plain GET cannot.
  console.log(
    `[Puppeteer Fallback] Axios failed after ${retries} attempts for ${url}. Launching Puppeteer...`
  );
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: "new",
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--single-process",
        ...(proxyUrl ? [`--proxy-server=${proxyUrl}`] : []),
      ],
      ignoreHTTPSErrors: true,
    });

    const page = await browser.newPage();
    await page.setUserAgent(randomUserAgent);

    // Surface in-page diagnostics in our own logs.
    page.on("console", (msg) => console.log(`[Browser Console] ${msg.text()}`));
    page.on("pageerror", (err) =>
      console.error(`[Browser Page Error] ${err.message}`)
    );
    // failure() may return null for some cancelled requests — guard it so the
    // logging handler itself cannot throw.
    page.on("requestfailed", (request) =>
      console.error(
        `[Browser Request Failed] ${request.url()} ${request.failure()?.errorText ?? "unknown"}`
      )
    );

    // Headless rendering is much slower than a raw GET; allow triple the budget.
    await page.goto(url, { waitUntil: "networkidle0", timeout: timeout * 3 });

    const htmlContent = await page.content();
    console.log(
      `[Puppeteer Success] Fetched ${url} with Puppeteer. Content length: ${htmlContent.length} bytes.`
    );
    return htmlContent;
  } catch (puppeteerErr) {
    console.error(
      `[Puppeteer Fallback] Failed to fetch ${url} with Puppeteer. Error: ${puppeteerErr.message}`
    );
    // Preserve the original stack for callers via `cause`.
    throw new Error(
      `Failed to fetch ${url} after Axios retries and Puppeteer fallback: ${puppeteerErr.message}`,
      { cause: puppeteerErr }
    );
  } finally {
    // Always release the browser process, success or failure.
    if (browser) {
      await browser.close();
    }
  }
}
71115
/**
 * Scrapes the latest notices from the IOE Examination Control Division website.
 * @returns {Promise<Array<object>>} - A list of notice objects (empty on failure).
 */
export async function scrapeIoeExamNotice() {
  const url = "http://exam.ioe.edu.np/";
  try {
    const html = await fetchWithRetry(url);
    const $ = cheerio.load(html);
    const notices = [];

    for (const el of $("#datatable tbody tr").toArray()) {
      const row = $(el);
      const titleEl = row.find("td:nth-child(2) a");
      const dateEl = row.find("td:nth-child(3)");
      const viewEl = row.find('td:nth-child(4) a[href*="/Notice/Index/"]');
      const downloadEl = row.find('td:nth-child(4) a[target="_blank"]');

      // Skip malformed rows that are missing any expected cell or link.
      if (
        !titleEl.length ||
        !dateEl.length ||
        !viewEl.length ||
        !downloadEl.length
      ) {
        continue;
      }

      // Resolve relative hrefs against the site root.
      notices.push({
        title: titleEl.text().trim(),
        link: new URL(viewEl.attr("href"), url).href,
        attachments: [new URL(downloadEl.attr("href"), url).href],
        date: dateEl.text().trim(),
        source: "IOE Exam Section",
      });
    }

    return notices;
  } catch (err) {
    console.error("[scrapeIoeExamNotice] Error during scraping or parsing:", err.message);
    return [];
  }
}
120165
/**
 * Scrapes the latest notice from the Pulchowk Campus website.
 * @returns {Promise<object|null>} - A single notice object or null on failure.
 */
export async function scrapePcampusNotice() {
  const listUrl = "https://pcampus.edu.np/category/general-notices/";
  try {
    const listHtml = await fetchWithRetry(listUrl);
    const $list = cheerio.load(listHtml);
    const article = $list("article").first();

    const titleAnchor = article.find("h2.entry-title a");
    const title = titleAnchor.text().trim();
    const pageLink = titleAnchor.attr("href");
    const date = article.find("time.entry-date").attr("datetime");
    const postId = article.attr("id");

    if (!pageLink) {
      console.warn("[scrapePcampusNotice] No page link found in latest article.");
      return null;
    }

    // Fetch the notice page itself to harvest its attachment links.
    const pageHtml = await fetchWithRetry(pageLink);
    const $page = cheerio.load(pageHtml);

    // Collect every uploaded-file link from the notice body, de-duplicated.
    const attachments = new Set();
    $page(".entry-content a").each((_, el) => {
      const href = $page(el).attr("href");
      if (href?.includes("/wp-content/uploads/")) {
        attachments.add(new URL(href, pageLink).href);
      }
    });

    return {
      id: postId,
      title,
      link: pageLink,
      attachments: [...attachments],
      date,
      source: "Pulchowk Campus",
    };
  } catch (err) {
    console.error("[scrapePcampusNotice] Error during scraping or parsing:", err.message);
    return null;
  }
}
162211
/**
 * Scrapes and combines the latest notices from all sources.
 * Each source is attempted independently; one source failing does not
 * prevent the other's results from being returned.
 * @returns {Promise<Array<object>>} - A combined list of notice objects.
 */
export async function scrapeLatestNotice() {
  console.log("Scraping latest notices from all sources...");
  const [ioeResult, pcampusResult] = await Promise.allSettled([
    scrapeIoeExamNotice(),
    scrapePcampusNotice(),
  ]);

  const combinedNotices = [];

  if (ioeResult.status === "fulfilled" && ioeResult.value) {
    combinedNotices.push(...ioeResult.value);
  } else {
    console.error("[scrapeLatestNotice] IOE Exam Notice scraping failed:", ioeResult.reason);
  }

  if (pcampusResult.status === "fulfilled" && pcampusResult.value) {
    combinedNotices.push(pcampusResult.value);
  } else {
    console.error("[scrapeLatestNotice] Pulchowk Campus Notice scraping failed:", pcampusResult.reason);
  }

  return combinedNotices;
}
0 commit comments