Skip to content

Commit da9a16c

Browse files
committed
Fix scraper errors by implementing cloudscraper to bypass Cloudflare protection
Changes: - Replaced requests with cloudscraper in all scraper modules (match.py, player.py, series.py, summary.py) - Added the cloudscraper dependency to requirements.txt - Fixed a bug in player.py line 196: changed self.headers() to self.headers - Removed the unused dateparser import from player.py - All modules now use cloudscraper.create_scraper() to handle Cloudflare's anti-bot protection. This update addresses 403 Access Denied errors caused by Cloudflare bot detection; the scraper will now work from any environment where ESPN Cricinfo itself is accessible.
1 parent 1a52ac9 commit da9a16c

File tree

5 files changed

+22
-17
lines changed

5 files changed

+22
-17
lines changed

espncricinfo/match.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import json
2-
import requests
2+
import cloudscraper
33
from bs4 import BeautifulSoup
44
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError
55

@@ -10,6 +10,7 @@ def __init__(self, match_id):
1010
self.match_url = "https://www.espncricinfo.com/matches/engine/match/{0}.html".format(str(match_id))
1111
self.json_url = "https://www.espncricinfo.com/matches/engine/match/{0}.json".format(str(match_id))
1212
self.headers = {'user-agent': 'Mozilla/5.0'}
13+
self.scraper = cloudscraper.create_scraper()
1314
self.json = self.get_json()
1415
self.html = self.get_html()
1516
self.comms_json = self.get_comms_json()
@@ -90,7 +91,7 @@ def __repr__(self):
9091
return (f'{self.__class__.__name__}('f'{self.match_id!r})')
9192

9293
def get_json(self):
93-
r = requests.get(self.json_url,headers=self.headers)
94+
r = self.scraper.get(self.json_url, headers=self.headers)
9495
if r.status_code == 404:
9596
raise MatchNotFoundError
9697
elif 'Scorecard not yet available' in r.text:
@@ -99,7 +100,7 @@ def get_json(self):
99100
return r.json()
100101

101102
def get_html(self):
102-
r = requests.get(self.match_url,headers=self.headers)
103+
r = self.scraper.get(self.match_url, headers=self.headers)
103104
if r.status_code == 404:
104105
raise MatchNotFoundError
105106
else:
@@ -432,6 +433,7 @@ def get_recent_matches(date=None):
432433
url = "https://www.espncricinfo.com/ci/engine/match/index.html?date=%sview=week" % date
433434
else:
434435
url = "https://www.espncricinfo.com/ci/engine/match/index.html?view=week"
435-
r = requests.get(url,headers={'user-agent': 'Mozilla/5.0'})
436+
scraper = cloudscraper.create_scraper()
437+
r = scraper.get(url, headers={'user-agent': 'Mozilla/5.0'})
436438
soup = BeautifulSoup(r.text, 'html.parser')
437439
return [x['href'].split('/',4)[4].split('.')[0] for x in soup.findAll('a', href=True, text='Scorecard')]

espncricinfo/player.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
import requests
1+
import cloudscraper
22
from bs4 import BeautifulSoup
3-
import dateparser
43
from espncricinfo.exceptions import PlayerNotFoundError
54
from espncricinfo.match import Match
65
import csv
@@ -13,6 +12,7 @@ def __init__(self, player_id):
1312
self.json_url = "http://core.espnuk.org/v2/sports/cricket/athletes/{0}".format(str(player_id))
1413
self.new_json_url = "https://hs-consumer-api.espncricinfo.com/v1/pages/player/home?playerId={0}".format(str(player_id))
1514
self.headers = {'user-agent': 'Mozilla/5.0'}
15+
self.scraper = cloudscraper.create_scraper()
1616
self.parsed_html = self.get_html()
1717
self.json = self.get_json()
1818
self.new_json = self.get_new_json()
@@ -29,21 +29,21 @@ def __init__(self, player_id):
2929
self.major_teams = self._major_teams()
3030

3131
def get_html(self):
32-
r = requests.get(self.url, headers=self.headers)
32+
r = self.scraper.get(self.url, headers=self.headers)
3333
if r.status_code == 404:
3434
raise PlayerNotFoundError
3535
else:
3636
return BeautifulSoup(r.text, 'html.parser')
3737

3838
def get_json(self):
39-
r = requests.get(self.json_url, headers=self.headers)
39+
r = self.scraper.get(self.json_url, headers=self.headers)
4040
if r.status_code == 404:
4141
raise PlayerNotFoundError
4242
else:
4343
return r.json()
44-
44+
4545
def get_new_json(self):
46-
r = requests.get(self.new_json_url, headers=self.headers)
46+
r = self.scraper.get(self.new_json_url, headers=self.headers)
4747
if r.status_code == 404:
4848
raise PlayerNotFoundError
4949
else:
@@ -127,7 +127,7 @@ def get_career_averages(self, file_name=None, match_format=11, data_type='allrou
127127
self.file_name = f"{self.player_id}_{self.match_format}_{self.data_type}_career_averages.csv"
128128

129129
self.url=f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class={self.match_format};template=results;type={self.data_type}"
130-
html_doc = requests.get(self.url, headers=self.headers)
130+
html_doc = self.scraper.get(self.url, headers=self.headers)
131131
soup = BeautifulSoup(html_doc.text, 'html.parser')
132132
tables = soup.find_all("table")[2]
133133
table_rows = tables.find_all("tr")
@@ -159,7 +159,7 @@ def get_career_summary(self, file_name=None, match_format=11, data_type='allroun
159159
self.file_name = f"{self.player_id}_{self.match_format}_{self.data_type}_career_summary.csv"
160160

161161
self.url=f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class={self.match_format};template=results;type={self.data_type}"
162-
html_doc = requests.get(self.url, headers=self.headers)
162+
html_doc = self.scraper.get(self.url, headers=self.headers)
163163
soup = BeautifulSoup(html_doc.text, 'html.parser')
164164
tables = soup.find_all("table")[3]
165165
table_rows = tables.find_all("tr")
@@ -193,7 +193,7 @@ def get_data(self, file_name=None, match_format=11, data_type='allround', view='
193193
self.file_name = f"{self.player_id}_{self.match_format}_{self.data_type}_{self.view}.csv"
194194

195195
self.url=f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class={self.match_format};template=results;type={self.data_type};view={self.view}"
196-
html_doc = requests.get(self.url, headers=self.headers())
196+
html_doc = self.scraper.get(self.url, headers=self.headers)
197197
soup = BeautifulSoup(html_doc.text, 'html.parser')
198198
tables = soup.find_all("table")[3]
199199
table_rows = tables.find_all("tr")

espncricinfo/series.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import requests
1+
import cloudscraper
22
from bs4 import BeautifulSoup
33
from espncricinfo.exceptions import MatchNotFoundError, NoSeriesError
44

@@ -10,6 +10,7 @@ def __init__(self, series_id):
1010
self.events_url = "http://core.espnuk.org/v2/sports/cricket/leagues/{0}/events".format(str(series_id))
1111
self.seasons_url = "http://core.espnuk.org/v2/sports/cricket/leagues/{0}/seasons".format(str(series_id))
1212
self.headers = {'user-agent': 'Mozilla/5.0'}
13+
self.scraper = cloudscraper.create_scraper()
1314
self.json = self.get_json(self.json_url)
1415
self.seasons = self._get_seasons()
1516
self.years = self._get_years_from_seasons()
@@ -26,7 +27,7 @@ def __init__(self, series_id):
2627
self.events = self._build_events()
2728

2829
def get_json(self, url):
29-
r = requests.get(url,headers=self.headers)
30+
r = self.scraper.get(url, headers=self.headers)
3031
if r.status_code == 404:
3132
raise "Not Found"
3233
else:

espncricinfo/summary.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import requests
1+
import cloudscraper
22
from bs4 import BeautifulSoup
33
from espncricinfo.match import Match
44

@@ -7,12 +7,13 @@ class Summary(object):
77
def __init__(self):
88
self.url = "http://static.cricinfo.com/rss/livescores.xml"
99
self.headers = {'user-agent': 'Mozilla/5.0'}
10+
self.scraper = cloudscraper.create_scraper()
1011
self.xml = self.get_xml()
1112
self.match_ids = self._match_ids()
1213
self.matches = self._build_matches()
1314

1415
def get_xml(self):
15-
r = requests.get(self.url, headers=self.headers)
16+
r = self.scraper.get(self.url, headers=self.headers)
1617
if r.status_code == 404:
1718
raise MatchNotFoundError
1819
else:

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
beautifulsoup4==4.9.1
2+
cloudscraper>=1.2.71
23
dateparser==1.1.6
34
jdatetime==3.6.2
45
python-dateutil==2.8.1

0 commit comments

Comments
 (0)