Skip to content

Commit da9a16c

Browse files
committed
Fix scraper errors by implementing cloudscraper to bypass Cloudflare protection
Changes: - Replaced requests with cloudscraper in all scraper modules (match.py, player.py, series.py, summary.py) - Added the cloudscraper dependency to requirements.txt - Fixed a bug in player.py line 196: changed self.headers() to self.headers - Removed the unused dateparser import from player.py - All modules now use cloudscraper.create_scraper() to handle Cloudflare's anti-bot protection. This update addresses 403 Access Denied errors caused by Cloudflare bot detection; the scraper will now work from any environment where ESPN Cricinfo itself is accessible.
1 parent 1a52ac9 commit da9a16c

File tree

5 files changed

+22
-17
lines changed

5 files changed

+22
-17
lines changed

espncricinfo/match.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import json
2-
import requests
2+
import cloudscraper
33
from bs4 import BeautifulSoup
44
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError
55

@@ -10,6 +10,7 @@ def __init__(self, match_id):
1010
self.match_url = "https://www.espncricinfo.com/matches/engine/match/{0}.html".format(str(match_id))
1111
self.json_url = "https://www.espncricinfo.com/matches/engine/match/{0}.json".format(str(match_id))
1212
self.headers = {'user-agent': 'Mozilla/5.0'}
13+
self.scraper = cloudscraper.create_scraper()
1314
self.json = self.get_json()
1415
self.html = self.get_html()
1516
self.comms_json = self.get_comms_json()
@@ -90,7 +91,7 @@ def __repr__(self):
9091
return (f'{self.__class__.__name__}('f'{self.match_id!r})')
9192

9293
def get_json(self):
93-
r = requests.get(self.json_url,headers=self.headers)
94+
r = self.scraper.get(self.json_url, headers=self.headers)
9495
if r.status_code == 404:
9596
raise MatchNotFoundError
9697
elif 'Scorecard not yet available' in r.text:
@@ -99,7 +100,7 @@ def get_json(self):
99100
return r.json()
100101

101102
def get_html(self):
102-
r = requests.get(self.match_url,headers=self.headers)
103+
r = self.scraper.get(self.match_url, headers=self.headers)
103104
if r.status_code == 404:
104105
raise MatchNotFoundError
105106
else:
@@ -432,6 +433,7 @@ def get_recent_matches(date=None):
432433
url = "https://www.espncricinfo.com/ci/engine/match/index.html?date=%sview=week" % date
433434
else:
434435
url = "https://www.espncricinfo.com/ci/engine/match/index.html?view=week"
435-
r = requests.get(url,headers={'user-agent': 'Mozilla/5.0'})
436+
scraper = cloudscraper.create_scraper()
437+
r = scraper.get(url, headers={'user-agent': 'Mozilla/5.0'})
436438
soup = BeautifulSoup(r.text, 'html.parser')
437439
return [x['href'].split('/',4)[4].split('.')[0] for x in soup.findAll('a', href=True, text='Scorecard')]

espncricinfo/player.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
import requests
1+
import cloudscraper
22
from bs4 import BeautifulSoup
3-
import dateparser
43
from espncricinfo.exceptions import PlayerNotFoundError
54
from espncricinfo.match import Match
65
import csv
@@ -13,6 +12,7 @@ def __init__(self, player_id):
1312
self.json_url = "http://core.espnuk.org/v2/sports/cricket/athletes/{0}".format(str(player_id))
1413
self.new_json_url = "https://hs-consumer-api.espncricinfo.com/v1/pages/player/home?playerId={0}".format(str(player_id))
1514
self.headers = {'user-agent': 'Mozilla/5.0'}
15+
self.scraper = cloudscraper.create_scraper()
1616
self.parsed_html = self.get_html()
1717
self.json = self.get_json()
1818
self.new_json = self.get_new_json()
@@ -29,21 +29,21 @@ def __init__(self, player_id):
2929
self.major_teams = self._major_teams()
3030

3131
def get_html(self):
32-
r = requests.get(self.url, headers=self.headers)
32+
r = self.scraper.get(self.url, headers=self.headers)
3333
if r.status_code == 404:
3434
raise PlayerNotFoundError
3535
else:
3636
return BeautifulSoup(r.text, 'html.parser')
3737

3838
def get_json(self):
39-
r = requests.get(self.json_url, headers=self.headers)
39+
r = self.scraper.get(self.json_url, headers=self.headers)
4040
if r.status_code == 404:
4141
raise PlayerNotFoundError
4242
else:
4343
return r.json()
44-
44+
4545
def get_new_json(self):
46-
r = requests.get(self.new_json_url, headers=self.headers)
46+
r = self.scraper.get(self.new_json_url, headers=self.headers)
4747
if r.status_code == 404:
4848
raise PlayerNotFoundError
4949
else:
@@ -127,7 +127,7 @@ def get_career_averages(self, file_name=None, match_format=11, data_type='allrou
127127
self.file_name = f"{self.player_id}_{self.match_format}_{self.data_type}_career_averages.csv"
128128

129129
self.url=f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class={self.match_format};template=results;type={self.data_type}"
130-
html_doc = requests.get(self.url, headers=self.headers)
130+
html_doc = self.scraper.get(self.url, headers=self.headers)
131131
soup = BeautifulSoup(html_doc.text, 'html.parser')
132132
tables = soup.find_all("table")[2]
133133
table_rows = tables.find_all("tr")
@@ -159,7 +159,7 @@ def get_career_summary(self, file_name=None, match_format=11, data_type='allroun
159159
self.file_name = f"{self.player_id}_{self.match_format}_{self.data_type}_career_summary.csv"
160160

161161
self.url=f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class={self.match_format};template=results;type={self.data_type}"
162-
html_doc = requests.get(self.url, headers=self.headers)
162+
html_doc = self.scraper.get(self.url, headers=self.headers)
163163
soup = BeautifulSoup(html_doc.text, 'html.parser')
164164
tables = soup.find_all("table")[3]
165165
table_rows = tables.find_all("tr")
@@ -193,7 +193,7 @@ def get_data(self, file_name=None, match_format=11, data_type='allround', view='
193193
self.file_name = f"{self.player_id}_{self.match_format}_{self.data_type}_{self.view}.csv"
194194

195195
self.url=f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class={self.match_format};template=results;type={self.data_type};view={self.view}"
196-
html_doc = requests.get(self.url, headers=self.headers())
196+
html_doc = self.scraper.get(self.url, headers=self.headers)
197197
soup = BeautifulSoup(html_doc.text, 'html.parser')
198198
tables = soup.find_all("table")[3]
199199
table_rows = tables.find_all("tr")

espncricinfo/series.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import requests
1+
import cloudscraper
22
from bs4 import BeautifulSoup
33
from espncricinfo.exceptions import MatchNotFoundError, NoSeriesError
44

@@ -10,6 +10,7 @@ def __init__(self, series_id):
1010
self.events_url = "http://core.espnuk.org/v2/sports/cricket/leagues/{0}/events".format(str(series_id))
1111
self.seasons_url = "http://core.espnuk.org/v2/sports/cricket/leagues/{0}/seasons".format(str(series_id))
1212
self.headers = {'user-agent': 'Mozilla/5.0'}
13+
self.scraper = cloudscraper.create_scraper()
1314
self.json = self.get_json(self.json_url)
1415
self.seasons = self._get_seasons()
1516
self.years = self._get_years_from_seasons()
@@ -26,7 +27,7 @@ def __init__(self, series_id):
2627
self.events = self._build_events()
2728

2829
def get_json(self, url):
29-
r = requests.get(url,headers=self.headers)
30+
r = self.scraper.get(url, headers=self.headers)
3031
if r.status_code == 404:
3132
raise "Not Found"
3233
else:

espncricinfo/summary.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import requests
1+
import cloudscraper
22
from bs4 import BeautifulSoup
33
from espncricinfo.match import Match
44

@@ -7,12 +7,13 @@ class Summary(object):
77
def __init__(self):
88
self.url = "http://static.cricinfo.com/rss/livescores.xml"
99
self.headers = {'user-agent': 'Mozilla/5.0'}
10+
self.scraper = cloudscraper.create_scraper()
1011
self.xml = self.get_xml()
1112
self.match_ids = self._match_ids()
1213
self.matches = self._build_matches()
1314

1415
def get_xml(self):
15-
r = requests.get(self.url, headers=self.headers)
16+
r = self.scraper.get(self.url, headers=self.headers)
1617
if r.status_code == 404:
1718
raise MatchNotFoundError
1819
else:

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
beautifulsoup4==4.9.1
2+
cloudscraper>=1.2.71
23
dateparser==1.1.6
34
jdatetime==3.6.2
45
python-dateutil==2.8.1

0 commit comments

Comments
 (0)