@@ -121,6 +121,33 @@ def _find_next_link(
     For reliable pagination, use `paginate_param()` with the known
     parameter name, or provide a `next_selector` to `paginate()`.
     """
+    # Ensure HTML is loaded
+    if not result.text:
+        return None
+
+    # Try common CSS selectors first (faster and more reliable)
+    common_selectors = [
+        "a[rel='next']",
+        "a[rel=next]",
+        "li.next a",
+        "a.next",
+        ".pagination .next a",
+        ".pager .next a",
+        "a.pagination-next",
+    ]
+
+    for selector in common_selectors:
+        try:
+            links = result.extractor.css_list(selector, attr="href")
+            if links and links[0]:
+                href = links[0]
+                if not href.startswith(("http://", "https://")):
+                    href = urljoin(result.final_url, href)
+                return href
+        except Exception:
+            continue
+
+    # Fallback to regex pattern matching on anchor HTML
     default_patterns = [
         r'rel=["\']?next["\']?',
         r'class=["\'][^"\']*next[^"\']*["\']',
@@ -130,14 +157,11 @@ def _find_next_link(
         r'>><',
         r'>»<',
         r'>›<',
+        r'>→<',
     ]

     search_patterns = patterns or default_patterns

-    # Ensure HTML is loaded
-    if not result.text:
-        return None
-
     # Use selectolax parser for anchor extraction (10x faster than BS4)
     for anchor in result.extractor.parser.css("a[href]"):
         # Get the HTML representation of anchor for pattern matching
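The CSS-selector pass added above can be reproduced outside the library with selectolax (which the code's own comment names) and urljoin. The sketch below is illustrative only and does not use the library's `result.extractor` API; the function name and HTML snippet are invented for the example.

    # Standalone sketch of the same heuristic, using selectolax directly.
    from urllib.parse import urljoin
    from selectolax.parser import HTMLParser

    def find_next_href(html: str, base_url: str) -> str | None:
        selectors = ["a[rel='next']", "li.next a", "a.next", ".pagination .next a"]
        tree = HTMLParser(html)
        for selector in selectors:
            node = tree.css_first(selector)
            if node is None:
                continue
            href = node.attributes.get("href")
            if href:
                # Resolve relative links against the page URL, as the diff does
                return urljoin(base_url, href)
        return None

    html = '<ul class="pagination"><li class="next"><a href="/items?page=2">Next</a></li></ul>'
    print(find_next_href(html, "https://example.com/items"))  # https://example.com/items?page=2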
@@ -272,41 +296,41 @@ def found_target(result):
     current_url = start_url
     page_count = 0

-    with Session(cfg) as sess:
-        while current_url and page_count < max_pages:
-            # Normalise URL for duplicate detection
-            normalised = current_url.rstrip("/").lower()
-            if normalised in visited:
-                break
-            visited.add(normalised)
-
-            try:
-                result = scrape(current_url, cfg, sess)
-            except Exception:
-                break
-
-            yield result
-            page_count += 1
-
-            # Check stop condition
-            if stop_if and stop_if(result):
+    while current_url and page_count < max_pages:
+        # Normalise URL for duplicate detection
+        normalised = current_url.rstrip("/").lower()
+        if normalised in visited:
+            break
+        visited.add(normalised)
+
+        try:
+            result = scrape(current_url, cfg)
+        except Exception:
+            break
+
+        yield result
+        page_count += 1
+
+        # Check stop condition
+        if stop_if and stop_if(result):
+            break
+
+        # Find next page URL
+        if next_selector:
+            next_links = result.extractor.css_list(next_selector, attr="href")
+            if next_links:
+                next_href = next_links[0]
+                if not next_href.startswith(("http://", "https://")):
+                    next_href = urljoin(result.final_url, next_href)
+                current_url = next_href
+            else:
                 break
-
-            # Find next page URL
-            if next_selector:
-                next_href = result.css(next_selector, "href")
-                if next_href:
-                    if not next_href.startswith(("http://", "https://")):
-                        next_href = urljoin(result.final_url, next_href)
-                    current_url = next_href
-                else:
-                    break
+        else:
+            next_url = _find_next_link(result, next_patterns)
+            if next_url:
+                current_url = next_url
             else:
-                next_url = _find_next_link(result, next_patterns)
-                if next_url:
-                    current_url = next_url
-                else:
-                    break
+                break


 def paginate_param(
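For reference, a usage sketch of `paginate()` after this change. The import path is hypothetical, and passing these parameters as keywords is an assumption; `next_selector`, `stop_if`, `max_pages`, `result.text` and `result.final_url` are the names visible in the hunk.

    # Hypothetical import path; adjust to wherever paginate() lives in this package.
    from scraperlib.pagination import paginate

    def found_target(result):
        # Example stop predicate: stop once a page mentions the phrase we care about.
        return "out of stock" in result.text

    for result in paginate(
        "https://example.com/products",
        next_selector="a[rel='next']",  # bypasses the _find_next_link heuristics
        stop_if=found_target,
        max_pages=20,
    ):
        print(result.final_url)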
@@ -365,24 +389,23 @@ def paginate_param(
     """
     cfg = config or Config()

-    with Session(cfg) as sess:
-        for page_num in range(start, end + 1):
-            # Construct URL with page parameter
-            parsed = urlparse(base_url)
-            params = parse_qs(parsed.query, keep_blank_values=True)
-            params[param] = [str(page_num)]
-            new_query = urlencode(params, doseq=True)
-            url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"
-
-            try:
-                result = scrape(url, cfg, sess)
-            except Exception:
-                break
+    for page_num in range(start, end + 1):
+        # Construct URL with page parameter
+        parsed = urlparse(base_url)
+        params = parse_qs(parsed.query, keep_blank_values=True)
+        params[param] = [str(page_num)]
+        new_query = urlencode(params, doseq=True)
+        url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"

-            yield result
+        try:
+            result = scrape(url, cfg)
+        except Exception:
+            break

-            if stop_if and stop_if(result):
-                break
+        yield result
+
+        if stop_if and stop_if(result):
+            break


 def paginate_offset(
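A usage sketch of `paginate_param()` (import path hypothetical, keyword usage assumed). Because the loop rebuilds the query with parse_qs/urlencode, existing parameters such as `q` are preserved while the named `param` is overwritten on each iteration.

    # Hypothetical import path; parameter names taken from the function body above.
    from scraperlib.pagination import paginate_param

    for result in paginate_param(
        "https://example.com/search?q=shoes",
        param="page",  # injected into the query string, pages 1 through 5
        start=1,
        end=5,
    ):
        print(result.final_url)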
@@ -445,27 +468,26 @@ def is_empty(result):
     """
     cfg = config or Config()

-    with Session(cfg) as sess:
-        offset = start
-        while offset <= max_offset:
-            # Construct URL with offset parameter
-            parsed = urlparse(base_url)
-            params = parse_qs(parsed.query, keep_blank_values=True)
-            params[param] = [str(offset)]
-            new_query = urlencode(params, doseq=True)
-            url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"
-
-            try:
-                result = scrape(url, cfg, sess)
-            except Exception:
-                break
+    offset = start
+    while offset <= max_offset:
+        # Construct URL with offset parameter
+        parsed = urlparse(base_url)
+        params = parse_qs(parsed.query, keep_blank_values=True)
+        params[param] = [str(offset)]
+        new_query = urlencode(params, doseq=True)
+        url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"

-            yield result
+        try:
+            result = scrape(url, cfg)
+        except Exception:
+            break

-            if stop_if and stop_if(result):
-                break
+        yield result
+
+        if stop_if and stop_if(result):
+            break

-            offset += step
+        offset += step


 def crawl(
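A usage sketch of `paginate_offset()` with an emptiness check as the stop condition, echoing the `is_empty(result)` helper named in the hunk header. The import path and the marker string tested inside `is_empty` are invented for illustration.

    # Hypothetical import path; param/start/step/max_offset mirror the loop above.
    from scraperlib.pagination import paginate_offset

    def is_empty(result):
        # Example predicate: stop once a page no longer contains result markup.
        return "search-result" not in result.text

    for result in paginate_offset(
        "https://example.com/api/listings",
        param="offset",
        start=0,
        step=20,
        max_offset=200,
        stop_if=is_empty,
    ):
        print(result.final_url)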
@@ -552,39 +574,38 @@ def crawl(
     queue: deque[str] = deque([start_url])
     page_count = 0

-    with Session(cfg) as sess:
-        while queue and page_count < max_pages:
-            url = queue.popleft()
+    while queue and page_count < max_pages:
+        url = queue.popleft()

-            # Normalise for duplicate detection
-            normalised = url.rstrip("/").lower()
-            if normalised in visited:
-                continue
-            visited.add(normalised)
+        # Normalise for duplicate detection
+        normalised = url.rstrip("/").lower()
+        if normalised in visited:
+            continue
+        visited.add(normalised)

-            try:
-                result = scrape(url, cfg, sess)
-            except Exception:
-                continue
+        try:
+            result = scrape(url, cfg)
+        except Exception:
+            continue

-            yield result
-            page_count += 1
+        yield result
+        page_count += 1

-            if stop_if and stop_if(result):
-                break
+        if stop_if and stop_if(result):
+            break

-            # Extract and filter new links
-            new_links = result.links(link_pattern, absolute=True)
-            for link in new_links:
-                link_normalised = link.rstrip("/").lower()
-                if link_normalised in visited:
-                    continue
+        # Extract and filter new links
+        new_links = result.links(link_pattern, absolute=True)
+        for link in new_links:
+            link_normalised = link.rstrip("/").lower()
+            if link_normalised in visited:
+                continue

-            # Domain filter
-            if same_domain and urlparse(link).netloc != start_domain:
-                continue
+            # Domain filter
+            if same_domain and urlparse(link).netloc != start_domain:
+                continue

-            queue.append(link)
+            queue.append(link)


 # These aliases improve API discoverability and match common naming expectations
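A usage sketch of `crawl()` (import path hypothetical, keyword usage assumed). `link_pattern` is forwarded to `result.links(...)` and `same_domain` compares each link's netloc against the start domain, as the loop above shows.

    # Hypothetical import path; link_pattern, same_domain and max_pages are the
    # parameters exercised by the loop above.
    from scraperlib.pagination import crawl

    for result in crawl(
        "https://example.com/blog/",
        link_pattern=r"/blog/\d{4}/",  # only follow dated article URLs
        same_domain=True,              # skip links whose netloc differs from the start URL
        max_pages=100,
    ):
        print(result.final_url)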