Commit 29984c7

Updated to correct version
1 parent 24ce697 commit 29984c7

File tree

1 file changed (+118 -97 lines changed)

easyscrape/pagination.py

Lines changed: 118 additions & 97 deletions
@@ -121,6 +121,33 @@ def _find_next_link(
     For reliable pagination, use `paginate_param()` with the known
     parameter name, or provide a `next_selector` to `paginate()`.
     """
+    # Ensure HTML is loaded
+    if not result.text:
+        return None
+
+    # Try common CSS selectors first (faster and more reliable)
+    common_selectors = [
+        "a[rel='next']",
+        "a[rel=next]",
+        "li.next a",
+        "a.next",
+        ".pagination .next a",
+        ".pager .next a",
+        "a.pagination-next",
+    ]
+
+    for selector in common_selectors:
+        try:
+            links = result.extractor.css_list(selector, attr="href")
+            if links and links[0]:
+                href = links[0]
+                if not href.startswith(("http://", "https://")):
+                    href = urljoin(result.final_url, href)
+                return href
+        except Exception:
+            continue
+
+    # Fallback to regex pattern matching on anchor HTML
     default_patterns = [
         r'rel=["\']?next["\']?',
         r'class=["\'][^"\']*next[^"\']*["\']',
@@ -130,14 +157,11 @@ def _find_next_link(
         r'>&gt;<',
         r'>»<',
         r'>›<',
+        r'>→<',
     ]
 
     search_patterns = patterns or default_patterns
 
-    # Ensure HTML is loaded
-    if not result.text:
-        return None
-
     # Use selectolax parser for anchor extraction (10x faster than BS4)
     for anchor in result.extractor.parser.css("a[href]"):
         # Get the HTML representation of anchor for pattern matching
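As a standalone illustration of the selector fast path added above, here is a minimal sketch using selectolax directly (the parser the comments reference); the helper name and HTML are illustrative and not part of easyscrape:

from urllib.parse import urljoin

from selectolax.parser import HTMLParser


def find_next_href(html: str, base_url: str) -> str | None:
    # Same idea as the fast path above: try rel=next-style selectors before any regex work.
    tree = HTMLParser(html)
    for selector in ("a[rel='next']", "li.next a", "a.next"):
        for node in tree.css(selector):
            href = node.attributes.get("href")
            if href:
                # Resolve relative links against the page URL, as the diff does with result.final_url.
                return href if href.startswith(("http://", "https://")) else urljoin(base_url, href)
    return None


html = '<ul class="pagination"><li class="next"><a href="/items?page=2">Next</a></li></ul>'
print(find_next_href(html, "https://example.com/items"))  # https://example.com/items?page=2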
@@ -272,41 +296,41 @@ def found_target(result):
     current_url = start_url
     page_count = 0
 
-    with Session(cfg) as sess:
-        while current_url and page_count < max_pages:
-            # Normalise URL for duplicate detection
-            normalised = current_url.rstrip("/").lower()
-            if normalised in visited:
-                break
-            visited.add(normalised)
-
-            try:
-                result = scrape(current_url, cfg, sess)
-            except Exception:
-                break
-
-            yield result
-            page_count += 1
-
-            # Check stop condition
-            if stop_if and stop_if(result):
+    while current_url and page_count < max_pages:
+        # Normalise URL for duplicate detection
+        normalised = current_url.rstrip("/").lower()
+        if normalised in visited:
+            break
+        visited.add(normalised)
+
+        try:
+            result = scrape(current_url, cfg)
+        except Exception:
+            break
+
+        yield result
+        page_count += 1
+
+        # Check stop condition
+        if stop_if and stop_if(result):
+            break
+
+        # Find next page URL
+        if next_selector:
+            next_links = result.extractor.css_list(next_selector, attr="href")
+            if next_links:
+                next_href = next_links[0]
+                if not next_href.startswith(("http://", "https://")):
+                    next_href = urljoin(result.final_url, next_href)
+                current_url = next_href
+            else:
                 break
-
-            # Find next page URL
-            if next_selector:
-                next_href = result.css(next_selector, "href")
-                if next_href:
-                    if not next_href.startswith(("http://", "https://")):
-                        next_href = urljoin(result.final_url, next_href)
-                    current_url = next_href
-                else:
-                    break
+        else:
+            next_url = _find_next_link(result, next_patterns)
+            if next_url:
+                current_url = next_url
             else:
-                next_url = _find_next_link(result, next_patterns)
-                if next_url:
-                    current_url = next_url
-                else:
-                    break
+                break
 
 
 def paginate_param(
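A hedged usage sketch of the rewritten loop; the import path and exact `paginate()` signature are assumptions based on the names visible in this diff (`next_selector`, `stop_if`, `max_pages`), not confirmed by it:

from easyscrape.pagination import paginate  # assumed import path


def found_target(result):
    # Stop as soon as a page mentions the phrase we care about.
    return "out of stock" in result.text


# With next_selector the loop uses result.extractor.css_list(); without it,
# it falls back to _find_next_link() and the rel=next heuristics above.
for page in paginate(
    "https://example.com/products?page=1",
    next_selector="a[rel='next']",
    stop_if=found_target,
    max_pages=10,
):
    print(page.final_url)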
@@ -365,24 +389,23 @@ def paginate_param(
     """
     cfg = config or Config()
 
-    with Session(cfg) as sess:
-        for page_num in range(start, end + 1):
-            # Construct URL with page parameter
-            parsed = urlparse(base_url)
-            params = parse_qs(parsed.query, keep_blank_values=True)
-            params[param] = [str(page_num)]
-            new_query = urlencode(params, doseq=True)
-            url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"
-
-            try:
-                result = scrape(url, cfg, sess)
-            except Exception:
-                break
+    for page_num in range(start, end + 1):
+        # Construct URL with page parameter
+        parsed = urlparse(base_url)
+        params = parse_qs(parsed.query, keep_blank_values=True)
+        params[param] = [str(page_num)]
+        new_query = urlencode(params, doseq=True)
+        url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"
 
-            yield result
+        try:
+            result = scrape(url, cfg)
+        except Exception:
+            break
 
-        if stop_if and stop_if(result):
-            break
+        yield result
+
+        if stop_if and stop_if(result):
+            break
 
 
 def paginate_offset(
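The URL construction in `paginate_param()` is plain standard library; a self-contained check of what it produces (the example URL is illustrative):

from urllib.parse import parse_qs, urlencode, urlparse


def with_param(base_url: str, param: str, value: int) -> str:
    # Mirrors the construction above: merge the page number into any existing
    # query string instead of appending a second copy of the parameter.
    parsed = urlparse(base_url)
    params = parse_qs(parsed.query, keep_blank_values=True)
    params[param] = [str(value)]
    return f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{urlencode(params, doseq=True)}"


print(with_param("https://example.com/search?q=lamps&page=1", "page", 3))
# https://example.com/search?q=lamps&page=3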
@@ -445,27 +468,26 @@ def is_empty(result):
     """
     cfg = config or Config()
 
-    with Session(cfg) as sess:
-        offset = start
-        while offset <= max_offset:
-            # Construct URL with offset parameter
-            parsed = urlparse(base_url)
-            params = parse_qs(parsed.query, keep_blank_values=True)
-            params[param] = [str(offset)]
-            new_query = urlencode(params, doseq=True)
-            url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"
-
-            try:
-                result = scrape(url, cfg, sess)
-            except Exception:
-                break
+    offset = start
+    while offset <= max_offset:
+        # Construct URL with offset parameter
+        parsed = urlparse(base_url)
+        params = parse_qs(parsed.query, keep_blank_values=True)
+        params[param] = [str(offset)]
+        new_query = urlencode(params, doseq=True)
+        url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{new_query}"
 
-            yield result
+        try:
+            result = scrape(url, cfg)
+        except Exception:
+            break
 
-        if stop_if and stop_if(result):
-            break
+        yield result
+
+        if stop_if and stop_if(result):
+            break
 
-        offset += step
+        offset += step
 
 
 def crawl(
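A hedged usage sketch of `paginate_offset()`; the parameter names (`param`, `start`, `step`, `max_offset`, `stop_if`) appear in this diff, but the exact signature, import path, and the stop condition shown are assumptions:

from easyscrape.pagination import paginate_offset  # assumed import path


def is_empty(result):
    # Stop once a page reports no more results (the marker text is illustrative).
    return "No results found" in result.text


# Walks offset=0, 20, 40, ... until max_offset, a failed request, or is_empty().
for page in paginate_offset(
    "https://example.com/api/listings?sort=new",
    param="offset",
    start=0,
    step=20,
    max_offset=200,
    stop_if=is_empty,
):
    print(page.final_url)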
@@ -552,39 +574,38 @@ def crawl(
     queue: deque[str] = deque([start_url])
     page_count = 0
 
-    with Session(cfg) as sess:
-        while queue and page_count < max_pages:
-            url = queue.popleft()
+    while queue and page_count < max_pages:
+        url = queue.popleft()
 
-            # Normalise for duplicate detection
-            normalised = url.rstrip("/").lower()
-            if normalised in visited:
-                continue
-            visited.add(normalised)
+        # Normalise for duplicate detection
+        normalised = url.rstrip("/").lower()
+        if normalised in visited:
+            continue
+        visited.add(normalised)
 
-            try:
-                result = scrape(url, cfg, sess)
-            except Exception:
-                continue
+        try:
+            result = scrape(url, cfg)
+        except Exception:
+            continue
 
-            yield result
-            page_count += 1
+        yield result
+        page_count += 1
 
-            if stop_if and stop_if(result):
-                break
+        if stop_if and stop_if(result):
+            break
 
-            # Extract and filter new links
-            new_links = result.links(link_pattern, absolute=True)
-            for link in new_links:
-                link_normalised = link.rstrip("/").lower()
-                if link_normalised in visited:
-                    continue
+        # Extract and filter new links
+        new_links = result.links(link_pattern, absolute=True)
+        for link in new_links:
+            link_normalised = link.rstrip("/").lower()
+            if link_normalised in visited:
+                continue
 
-                # Domain filter
-                if same_domain and urlparse(link).netloc != start_domain:
-                    continue
+            # Domain filter
+            if same_domain and urlparse(link).netloc != start_domain:
+                continue
 
-                queue.append(link)
+            queue.append(link)
 
 
 # These aliases improve API discoverability and match common naming expectations
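The crawl loop above is a plain breadth-first traversal; a self-contained sketch of the same bookkeeping (FIFO queue, normalised visited set, same-domain filter) with a stubbed link source, independent of easyscrape:

from collections import deque
from urllib.parse import urlparse


def crawl_urls(start_url, get_links, max_pages=10, same_domain=True):
    # Same bookkeeping as crawl(): breadth-first queue, normalised visited set, domain filter.
    start_domain = urlparse(start_url).netloc
    visited: set[str] = set()
    queue: deque[str] = deque([start_url])
    pages = 0
    while queue and pages < max_pages:
        url = queue.popleft()
        normalised = url.rstrip("/").lower()
        if normalised in visited:
            continue
        visited.add(normalised)
        yield url
        pages += 1
        for link in get_links(url):
            if link.rstrip("/").lower() in visited:
                continue
            if same_domain and urlparse(link).netloc != start_domain:
                continue
            queue.append(link)


# Toy link graph standing in for real scrape() + result.links() calls.
graph = {
    "https://example.com": ["https://example.com/a", "https://other.org/x"],
    "https://example.com/a": ["https://example.com"],
}
print(list(crawl_urls("https://example.com", lambda u: graph.get(u, []))))
# ['https://example.com', 'https://example.com/a']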
