@@ -2,8 +2,6 @@ import axios from "axios";
22import * as cheerio from "cheerio" ;
33import { HttpProxyAgent } from "http-proxy-agent" ;
44import { HttpsProxyAgent } from "https-proxy-agent" ;
5- import puppeteer from "puppeteer" ;
6- // Import the new logging system
75import { log } from '../utils/debug.js' ;
86
97const USER_AGENTS = [
@@ -18,7 +16,6 @@ const USER_AGENTS = [
1816
1917/**
2018 * Fetches a URL with a retry mechanism, proxy support, and rotating user-agents.
21- * Includes a Puppeteer fallback if initial Axios fetch fails or returns incomplete data.
2219 * @param {string } url - The URL to fetch.
2320 * @param {number } [retries=3] - The number of times to retry on Axios failure.
2421 * @param {number } [timeout=60000] - The timeout for each request in milliseconds.
@@ -30,7 +27,6 @@ export async function fetchWithRetry(url, retries = 3, timeout = 60000) {
3027
3128 for ( let attempt = 1 ; attempt <= retries ; attempt ++ ) {
3229 try {
33- // Use the new log function
3430 log (
3531 `[Axios Attempt ${ attempt } /${ retries } ] Fetching ${ url } ` +
3632 ( proxyUrl ? ` via proxy: ${ proxyUrl } ` : " directly" ) , 'info'
@@ -53,15 +49,12 @@ export async function fetchWithRetry(url, retries = 3, timeout = 60000) {
5349 const { data } = await axiosInstance . get ( url ) ;
5450
5551 if ( ! data || data . length < 500 ) {
56- // Use the new log function
57- log ( `[Axios] Data for ${ url } is too small or empty (${ data ? data . length : 0 } bytes). Triggering retry or Puppeteer fallback.` , 'warn' ) ;
52+ log ( `[Axios] Data for ${ url } is too small or empty (${ data ? data . length : 0 } bytes). Triggering retry.` , 'warn' ) ;
5853 throw new Error ( "Incomplete or empty data from Axios." ) ;
5954 }
60- // Use the new log function
6155 log ( `[Axios Success] Fetched ${ url } with ${ data . length } bytes.` , 'info' ) ;
6256 return data ;
6357 } catch ( err ) {
64- // Use the new log function
6558 log (
6659 `[Axios Attempt ${ attempt } ] Failed to fetch ${ url } . Error: ${ err . message } ` , 'error'
6760 ) ;
@@ -73,45 +66,18 @@ export async function fetchWithRetry(url, retries = 3, timeout = 60000) {
7366 log ( `[Axios] Error setting up request: ${ err . message } ` , 'error' ) ;
7467 }
7568 if ( attempt === retries ) {
76- log ( `[Axios] All attempts failed. Initiating Puppeteer fallback .` , 'warn ' ) ;
77- break ;
69+ log ( `[Axios] All attempts failed. Throwing final error .` , 'error ' ) ;
70+ throw new Error ( `Failed to fetch ${ url } after multiple attempts.` ) ;
7871 }
7972 }
8073 }
81-
82- try {
83- // Use the new log function
84- log ( `[Puppeteer] Starting Puppeteer fallback for ${ url } ` , 'info' ) ;
85-
86- const browser = await puppeteer . launch ( {
87- args : [ "--no-sandbox" , "--disable-setuid-sandbox" ] ,
88- headless : "new"
89- } ) ;
90- const page = await browser . newPage ( ) ;
91- await page . setUserAgent ( randomUserAgent ) ;
92- await page . goto ( url , { waitUntil : "networkidle2" , timeout } ) ;
93- const htmlContent = await page . content ( ) ;
94- await browser . close ( ) ;
95-
96- if ( ! htmlContent || htmlContent . length < 500 ) {
97- log ( `[Puppeteer] Data for ${ url } is too small or empty (${ htmlContent ? htmlContent . length : 0 } bytes).` , 'warn' ) ;
98- throw new Error ( "Incomplete or empty data from Puppeteer." ) ;
99- }
100-
101- log ( `[Puppeteer Success] Fetched ${ url } with ${ htmlContent . length } bytes.` , 'info' ) ;
102- return htmlContent ;
103- } catch ( err ) {
104- log ( `[Puppeteer] Failed to fetch ${ url } via Puppeteer: ${ err . message } ` , 'error' ) ;
105- throw new Error ( `Failed to fetch ${ url } after multiple attempts.` ) ;
106- }
10774}
10875
10976/**
11077 * Scrapes IOE Exam notices from the official website.
11178 * @returns {Promise<Array<object>> } - A list of notice objects.
11279 */
11380export async function scrapeIoeExamNotice ( ) {
114- // Reverted URL and selectors to the previous working version
11581 const url = "http://exam.ioe.edu.np/" ;
11682 log ( `[scrapeIoeExamNotice] Scraping ${ url } ` , 'info' ) ;
11783 try {
@@ -120,30 +86,30 @@ export async function scrapeIoeExamNotice() {
12086 const notices = [ ] ;
12187
12288 $ ( "#datatable tbody tr" ) . each ( ( _ , el ) => {
123- const row = $ ( el ) ;
124- const titleElement = row . find ( "td:nth-child(2) a" ) ;
125- const dateElement = row . find ( "td:nth-child(3)" ) ;
126- const viewLinkElement = row . find (
127- 'td:nth-child(4) a[href*="/Notice/Index/"]'
128- ) ;
129- const downloadLinkElement = row . find (
130- 'td:nth-child(4) a[target="_blank"]'
131- ) ;
132-
133- if ( titleElement . length && dateElement . length && viewLinkElement . length && downloadLinkElement . length ) {
134- const title = titleElement . text ( ) . trim ( ) ;
135- const date = dateElement . text ( ) . trim ( ) ;
136- const noticePageLink = new URL ( viewLinkElement . attr ( "href" ) , url ) . href ;
137- const pdfLink = new URL ( downloadLinkElement . attr ( "href" ) , url ) . href ;
138-
139- notices . push ( {
140- title,
141- link : noticePageLink ,
142- attachments : [ pdfLink ] ,
143- date,
144- source : "IOE Exam Section" ,
145- } ) ;
146- }
89+ const row = $ ( el ) ;
90+ const titleElement = row . find ( "td:nth-child(2) a" ) ;
91+ const dateElement = row . find ( "td:nth-child(3)" ) ;
92+ const viewLinkElement = row . find (
93+ 'td:nth-child(4) a[href*="/Notice/Index/"]'
94+ ) ;
95+ const downloadLinkElement = row . find (
96+ 'td:nth-child(4) a[target="_blank"]'
97+ ) ;
98+
99+ if ( titleElement . length && dateElement . length && viewLinkElement . length && downloadLinkElement . length ) {
100+ const title = titleElement . text ( ) . trim ( ) ;
101+ const date = dateElement . text ( ) . trim ( ) ;
102+ const noticePageLink = new URL ( viewLinkElement . attr ( "href" ) , url ) . href ;
103+ const pdfLink = new URL ( downloadLinkElement . attr ( "href" ) , url ) . href ;
104+
105+ notices . push ( {
106+ title,
107+ link : noticePageLink ,
108+ attachments : [ pdfLink ] ,
109+ date,
110+ source : "IOE Exam Section" ,
111+ } ) ;
112+ }
147113 } ) ;
148114 log ( `[scrapeIoeExamNotice] Scraped ${ notices . length } notices.` , 'info' ) ;
149115 return notices ;
@@ -159,51 +125,50 @@ export async function scrapeIoeExamNotice() {
159125 * @returns {Promise<Array<object>> } - A list of notice objects.
160126 */
/**
 * Scrapes recent notices from the Pulchowk Campus homepage widget.
 *
 * Fetches the homepage, reads the "#recent-posts-2" recent-posts widget,
 * then fetches each notice's detail page in parallel to collect any
 * "/wp-content/uploads/" attachment links found in the entry content.
 *
 * @returns {Promise<Array<object>>} - A list of notice objects with
 *   { title, link, attachments, date, source }. Returns [] when the widget
 *   is empty or the initial fetch/parse fails; individual detail-page
 *   failures are logged and dropped rather than failing the whole scrape.
 */
export async function scrapePcampusNotice() {
  const listUrl = "https://pcampus.edu.np/";
  log(`[scrapePcampusNotice] Scraping ${listUrl}`, 'info');

  try {
    const listData = await fetchWithRetry(listUrl);
    const $list = cheerio.load(listData);
    const noticeItems = $list("#recent-posts-2 ul li");
    if (noticeItems.length === 0) {
      log("[scrapePcampusNotice] Could not find any notices in the widget.", 'warn');
      return [];
    }

    // Fetch every notice detail page concurrently; each promise resolves to a
    // notice object, or null if that one page failed (logged below).
    const noticeDetailPromises = [];
    noticeItems.each((_, el) => {
      const item = $list(el);
      const titleElement = item.find("a");
      const pageLink = titleElement.attr("href");
      const title = titleElement.text().trim();
      const date = item.find(".post-date").text().trim();
      if (pageLink) {
        const detailPromise = (async () => {
          try {
            const pageData = await fetchWithRetry(pageLink);
            const $page = cheerio.load(pageData);
            const attachments = [];
            $page(".entry-content a").each((_, a) => {
              const href = $page(a).attr("href");
              if (href?.includes("/wp-content/uploads/")) {
                // Resolve relative hrefs against the notice page URL.
                attachments.push(new URL(href, pageLink).href);
              }
            });
            // De-duplicate attachment URLs while preserving order.
            return { title, link: pageLink, attachments: [...new Set(attachments)], date, source: "Pulchowk Campus" };
          } catch (err) {
            log(`[scrapePcampusNotice] Failed to fetch details for ${pageLink}. Error: ${err.message}`, 'error');
            return null; // best-effort: skip this notice, keep the rest
          }
        })();
        noticeDetailPromises.push(detailPromise);
      }
    });

    const results = await Promise.all(noticeDetailPromises);
    return results.filter(notice => notice !== null);
  } catch (err) {
    // Fixed: previously called log("…:", 'error', null, err, 'error') with
    // extra positional arguments and the level passed twice, inconsistent
    // with the log(message, level) convention used everywhere else here.
    log(`[scrapePcampusNotice] Error during scraping or parsing: ${err.message}`, 'error');
    return [];
  }
}
209174
0 commit comments