From 1101b041618a53169d6d3d6d93e8cca26376a18d Mon Sep 17 00:00:00 2001 From: Param Gattupalli Date: Fri, 27 Sep 2024 23:00:07 -0400 Subject: [PATCH 1/2] Added get_roster function that takes in team abbreviation and year to return roster of team from that year --- basketball_reference_web_scraper/client.py | 37 +++++++++++++++++-- basketball_reference_web_scraper/errors.py | 5 +++ basketball_reference_web_scraper/html.py | 11 ++++++ .../http_service.py | 17 ++++++++- bin/normalizer | 4 +- bin/pip | 4 +- bin/pip3 | 4 +- 7 files changed, 75 insertions(+), 7 deletions(-) diff --git a/basketball_reference_web_scraper/client.py b/basketball_reference_web_scraper/client.py index 2694c5f3..95ad5dac 100644 --- a/basketball_reference_web_scraper/client.py +++ b/basketball_reference_web_scraper/client.py @@ -1,6 +1,6 @@ import requests -from basketball_reference_web_scraper.errors import InvalidSeason, InvalidDate, InvalidPlayerAndSeason +from basketball_reference_web_scraper.errors import InvalidSeason, InvalidDate, InvalidPlayerAndSeason, InvalidTeam from basketball_reference_web_scraper.http_service import HTTPService from basketball_reference_web_scraper.output.columns import BOX_SCORE_COLUMN_NAMES, SCHEDULE_COLUMN_NAMES, \ PLAYER_SEASON_TOTALS_COLUMN_NAMES, \ @@ -11,8 +11,8 @@ from basketball_reference_web_scraper.output.writers import CSVWriter, JSONWriter, FileOptions, OutputOptions, \ SearchCSVWriter from basketball_reference_web_scraper.parser_service import ParserService - - +from datetime import datetime +from basketball_reference_web_scraper.data import TEAM_TO_TEAM_ABBREVIATION def standings(season_end_year, output_type=None, output_file_path=None, output_write_option=None, json_options=None): try: @@ -212,6 +212,35 @@ def team_box_scores(day, month, year, output_type=None, output_file_path=None, o ) return output_service.output(data=values, options=options) +def get_roster(team, year=None, output_type=None, output_file_path=None, output_write_option=None, json_options=None): + try: + http_service = HTTPService(parser=ParserService()) + if year == None: + today = datetime.now() + year = today.year + if today.month >=7: + year += 1 + if len(team) > 3: + team=TEAM_TO_TEAM_ABBREVIATION[team.upper()] + values=http_service.get_team_roster(team=team, year=year) + except requests.exceptions.HTTPError as http_error: + if http_error.response.status_code == requests.codes.not_found: + raise InvalidTeam(team=team, year=year) + else: + raise http_error + + options = OutputOptions.of( + file_options=FileOptions.of(path=output_file_path, mode=output_write_option), + output_type=output_type, + json_options=json_options, + csv_options={"column_names": "Players"} + ) + + output_service = OutputService( + json_writer=JSONWriter(value_formatter=BasketballReferenceJSONEncoder), + csv_writer=CSVWriter(value_formatter=format_value) + ) + return output_service.output(data=values, options=options) def play_by_play(home_team, day, month, year, output_type=None, output_file_path=None, output_write_option=None, json_options=None): @@ -250,3 +279,5 @@ def search(term, output_type=None, output_file_path=None, output_write_option=No csv_writer=SearchCSVWriter(value_formatter=format_value) ) return output_service.output(data=values, options=options) + + diff --git a/basketball_reference_web_scraper/errors.py b/basketball_reference_web_scraper/errors.py index 12b574f8..0638197e 100644 --- a/basketball_reference_web_scraper/errors.py +++ b/basketball_reference_web_scraper/errors.py @@ -20,3 +20,8 @@ def __init__(self, player_identifier, season_end_year): message = "Player with identifier \"{player_identifier}\" in season ending in {season_end_year} is invalid" \ .format(player_identifier=player_identifier, season_end_year=season_end_year) super().__init__(message) + +class InvalidTeam(Exception): + def __init__(self, team, year): + message = "Team \"{team}\" in {year} is invalid".format(team=team, year=year) + super().__init__(message) \ No newline at end of file diff --git a/basketball_reference_web_scraper/html.py b/basketball_reference_web_scraper/html.py index 8bb63b49..808fe641 100644 --- a/basketball_reference_web_scraper/html.py +++ b/basketball_reference_web_scraper/html.py @@ -870,6 +870,17 @@ def game_url_paths(self): game_links = self.html.xpath(self.game_url_paths_query) return [game_link.attrib['href'] for game_link in game_links] +class TeamRoster: + def __init__(self, html): + self.html = html + + @property + def roster_query(self): + return '//table[@id="roster"]//td[@data-stat="player"]' + @property + def team_roster(self): + players = self.html.xpath(self.roster_query) + return [player.text_content() for player in players] class SchedulePage: def __init__(self, html): diff --git a/basketball_reference_web_scraper/http_service.py b/basketball_reference_web_scraper/http_service.py index 466b566a..c628aa54 100644 --- a/basketball_reference_web_scraper/http_service.py +++ b/basketball_reference_web_scraper/http_service.py @@ -1,3 +1,5 @@ +from datetime import datetime, timezone + import requests from lxml import html @@ -5,7 +7,7 @@ from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason from basketball_reference_web_scraper.html import DailyLeadersPage, PlayerSeasonBoxScoresPage, PlayerSeasonTotalTable, \ PlayerAdvancedSeasonTotalsTable, PlayByPlayPage, SchedulePage, BoxScoresPage, DailyBoxScoresPage, SearchPage, \ - PlayerPage, StandingsPage + PlayerPage, StandingsPage, TeamRoster class HTTPService: @@ -194,6 +196,17 @@ def team_box_scores(self, day, month, year): for box_score in self.team_box_score(game_url_path=game_url_path) ] + def get_team_roster(self, team, year): + url = "{BASE_URL}/teams/{team}/{year}.html".format(BASE_URL=HTTPService.BASE_URL, team=team, year=year) + + response = requests.get(url=url) + + response.raise_for_status() + + page = TeamRoster(html=html.fromstring(response.content)) + return page.team_roster + + def search(self, term): response = requests.get( url="{BASE_URL}/search/search.fcgi".format(BASE_URL=HTTPService.BASE_URL), @@ -240,3 +253,5 @@ def search(self, term): return { "players": player_results } + + diff --git a/bin/normalizer b/bin/normalizer index a48793ed..406afba3 100755 --- a/bin/normalizer +++ b/bin/normalizer @@ -1,4 +1,6 @@ -#!/Users/jaebradley/projects/basketball_reference_web_scraper/bin/python3 +#!/bin/sh +'''exec' "/Users/paramgattupalli/Documents/Fall 2024/CEN 3031/basketball_reference_web_scraper/bin/python" "$0" "$@" +' ''' # -*- coding: utf-8 -*- import re import sys diff --git a/bin/pip b/bin/pip index eadc7df2..502f5b4b 100755 --- a/bin/pip +++ b/bin/pip @@ -1,4 +1,6 @@ -#!/Users/jaebradley/projects/basketball_reference_web_scraper/bin/python3 +#!/bin/sh +'''exec' "/Users/paramgattupalli/Documents/Fall 2024/CEN 3031/basketball_reference_web_scraper/bin/python" "$0" "$@" +' ''' # -*- coding: utf-8 -*- import re import sys diff --git a/bin/pip3 b/bin/pip3 index eadc7df2..502f5b4b 100755 --- a/bin/pip3 +++ b/bin/pip3 @@ -1,4 +1,6 @@ -#!/Users/jaebradley/projects/basketball_reference_web_scraper/bin/python3 +#!/bin/sh +'''exec' "/Users/paramgattupalli/Documents/Fall 2024/CEN 3031/basketball_reference_web_scraper/bin/python" "$0" "$@" +' ''' # -*- coding: utf-8 -*- import re import sys From cb39ff3ee069bce59e98cdbb688ae62cfc1e3162 Mon Sep 17 00:00:00 2001 From: Param Gattupalli Date: Sun, 15 Dec 2024 23:54:29 -0500 Subject: [PATCH 2/2] Created TeamSeasonPage and RosterRow classes to reimplement roster function. Also made season_end_year non-optional --- basketball_reference_web_scraper/client.py | 23 ++++--- basketball_reference_web_scraper/errors.py | 2 +- basketball_reference_web_scraper/html.py | 60 +++++++++++++++++-- .../http_service.py | 10 ++-- .../output/columns.py | 5 ++ 5 files changed, 76 insertions(+), 24 deletions(-) diff --git a/basketball_reference_web_scraper/client.py b/basketball_reference_web_scraper/client.py index 95ad5dac..8a669225 100644 --- a/basketball_reference_web_scraper/client.py +++ b/basketball_reference_web_scraper/client.py @@ -1,17 +1,17 @@ import requests -from basketball_reference_web_scraper.errors import InvalidSeason, InvalidDate, InvalidPlayerAndSeason, InvalidTeam +from basketball_reference_web_scraper.errors import InvalidSeason, InvalidDate, InvalidPlayerAndSeason, \ + InvalidTeamSeason from basketball_reference_web_scraper.http_service import HTTPService from basketball_reference_web_scraper.output.columns import BOX_SCORE_COLUMN_NAMES, SCHEDULE_COLUMN_NAMES, \ PLAYER_SEASON_TOTALS_COLUMN_NAMES, \ PLAYER_ADVANCED_SEASON_TOTALS_COLUMN_NAMES, TEAM_BOX_SCORES_COLUMN_NAMES, PLAY_BY_PLAY_COLUMN_NAMES, \ - PLAYER_SEASON_BOX_SCORE_COLUMN_NAMES, SEARCH_RESULTS_COLUMN_NAMES, STANDINGS_COLUMNS_NAMES + PLAYER_SEASON_BOX_SCORE_COLUMN_NAMES, SEARCH_RESULTS_COLUMN_NAMES, STANDINGS_COLUMNS_NAMES, ROSTER_COLUMN_NAMES from basketball_reference_web_scraper.output.fields import format_value, BasketballReferenceJSONEncoder from basketball_reference_web_scraper.output.service import OutputService from basketball_reference_web_scraper.output.writers import CSVWriter, JSONWriter, FileOptions, OutputOptions, \ SearchCSVWriter from basketball_reference_web_scraper.parser_service import ParserService -from datetime import datetime from basketball_reference_web_scraper.data import TEAM_TO_TEAM_ABBREVIATION def standings(season_end_year, output_type=None, output_file_path=None, output_write_option=None, json_options=None): @@ -212,20 +212,16 @@ def team_box_scores(day, month, year, output_type=None, output_file_path=None, o ) return output_service.output(data=values, options=options) -def get_roster(team, year=None, output_type=None, output_file_path=None, output_write_option=None, json_options=None): + +def roster(team, season_end_year, output_type=None, output_file_path=None, output_write_option=None, json_options=None): try: http_service = HTTPService(parser=ParserService()) - if year == None: - today = datetime.now() - year = today.year - if today.month >=7: - year += 1 if len(team) > 3: - team=TEAM_TO_TEAM_ABBREVIATION[team.upper()] - values=http_service.get_team_roster(team=team, year=year) + team = TEAM_TO_TEAM_ABBREVIATION[team.upper()] + values = http_service.get_team_roster(team=team, season_end_year=season_end_year) except requests.exceptions.HTTPError as http_error: if http_error.response.status_code == requests.codes.not_found: - raise InvalidTeam(team=team, year=year) + raise InvalidTeamSeason(team=team, year=season_end_year) else: raise http_error @@ -233,7 +229,7 @@ def get_roster(team, year=None, output_type=None, output_file_path=None, output_ file_options=FileOptions.of(path=output_file_path, mode=output_write_option), output_type=output_type, json_options=json_options, - csv_options={"column_names": "Players"} + csv_options={"column_names": ROSTER_COLUMN_NAMES} ) output_service = OutputService( @@ -242,6 +238,7 @@ def get_roster(team, year=None, output_type=None, output_file_path=None, output_ ) return output_service.output(data=values, options=options) + def play_by_play(home_team, day, month, year, output_type=None, output_file_path=None, output_write_option=None, json_options=None): try: diff --git a/basketball_reference_web_scraper/errors.py b/basketball_reference_web_scraper/errors.py index 0638197e..13a56bc3 100644 --- a/basketball_reference_web_scraper/errors.py +++ b/basketball_reference_web_scraper/errors.py @@ -21,7 +21,7 @@ def __init__(self, player_identifier, season_end_year): .format(player_identifier=player_identifier, season_end_year=season_end_year) super().__init__(message) -class InvalidTeam(Exception): +class InvalidTeamSeason(Exception): def __init__(self, team, year): message = "Team \"{team}\" in {year} is invalid".format(team=team, year=year) super().__init__(message) \ No newline at end of file diff --git a/basketball_reference_web_scraper/html.py b/basketball_reference_web_scraper/html.py index 808fe641..393686cd 100644 --- a/basketball_reference_web_scraper/html.py +++ b/basketball_reference_web_scraper/html.py @@ -870,17 +870,67 @@ def game_url_paths(self): game_links = self.html.xpath(self.game_url_paths_query) return [game_link.attrib['href'] for game_link in game_links] -class TeamRoster: + +class TeamSeasonPage: def __init__(self, html): self.html = html @property def roster_query(self): - return '//table[@id="roster"]//td[@data-stat="player"]' + return '//table[@id="roster"]' + + @property + def rows_query(self): + return '//table[@id="roster"]//tbody//tr' + @property - def team_roster(self): - players = self.html.xpath(self.roster_query) - return [player.text_content() for player in players] + def team_roster_table(self): + return self.html.xpath(self.roster_query) + + @property + def rows(self): + return [ + RosterRow(html=row_html) + for row_html in self.html.xpath(self.rows_query) + ] + + +class RosterRow(PlayerIdentificationRow): + def __init__(self, html): + super().__init__(html=html) + + @property + def number(self): + cells = self.html.xpath('.//td[@data-stat="number"]') + if len(cells) > 0: + return cells[0].text_content() + + return '' + + @property + def position_abbreviations(self): + cells = self.html.xpath('.//td[@data-stat="pos"]') + if len(cells) > 0: + return cells[0].text_content() + + return '' + + @property + def height(self): + cells = self.html.xpath('.//td[@data-stat="height"]') + if len(cells) > 0: + return cells[0].text_content() + + return '' + + @property + def weight(self): + cells = self.html.xpath('.//td[@data-stat="weight"]') + if len(cells) > 0: + return cells[0].text_content() + + return '' + class SchedulePage: def __init__(self, html): diff --git a/basketball_reference_web_scraper/http_service.py b/basketball_reference_web_scraper/http_service.py index c628aa54..afa3aad4 100644 --- a/basketball_reference_web_scraper/http_service.py +++ b/basketball_reference_web_scraper/http_service.py @@ -7,7 +7,7 @@ from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason from basketball_reference_web_scraper.html import DailyLeadersPage, PlayerSeasonBoxScoresPage, PlayerSeasonTotalTable, \ PlayerAdvancedSeasonTotalsTable, PlayByPlayPage, SchedulePage, BoxScoresPage, DailyBoxScoresPage, SearchPage, \ - PlayerPage, StandingsPage, TeamRoster + PlayerPage, StandingsPage, TeamSeasonPage class HTTPService: @@ -196,15 +196,15 @@ def team_box_scores(self, day, month, year): for box_score in self.team_box_score(game_url_path=game_url_path) ] - def get_team_roster(self, team, year): - url = "{BASE_URL}/teams/{team}/{year}.html".format(BASE_URL=HTTPService.BASE_URL, team=team, year=year) + def get_team_roster(self, team, season_end_year): + url = "{BASE_URL}/teams/{team}/{season_end_year}.html".format(BASE_URL=HTTPService.BASE_URL, team=team, season_end_year=season_end_year) response = requests.get(url=url) response.raise_for_status() - page = TeamRoster(html=html.fromstring(response.content)) - return page.team_roster + page = TeamSeasonPage(html=html.fromstring(response.content)) + return [{'slug': row.slug, 'name': row.name} for row in page.rows] def search(self, term): diff --git a/basketball_reference_web_scraper/output/columns.py b/basketball_reference_web_scraper/output/columns.py index 7c58d83a..753897b8 100644 --- a/basketball_reference_web_scraper/output/columns.py +++ b/basketball_reference_web_scraper/output/columns.py @@ -136,3 +136,8 @@ "division", "conference", ] + +ROSTER_COLUMN_NAMES = [ + "slug", + "name", +] \ No newline at end of file