diff --git a/README.md b/README.md index bf590319..274fc85d 100644 --- a/README.md +++ b/README.md @@ -42,13 +42,14 @@ from basketball_reference_web_scraper.data import Team ## API -This client has seven methods +This client has eight methods * Getting player box scores by a date (`client.player_box_scores`) * Getting team box scores by a date (`client.team_box_scores`) * Getting the schedule for a season (`client.season_schedule`) * Getting players totals for a season (`client.players_season_totals`) * Getting players advanced season statistics for a season (`client.players_advanced_season_totals`) * Getting regular season box scores for a given player and season (`client.regular_season_player_box_scores`) +* Getting the salaries of players of a team for a season (`client.team_salaries`) * Searching (`client.search`) You can see all methods used in [this `repl`]()https://repl.it/@jaebradley/v300api-examples). @@ -179,6 +180,21 @@ The `player_identifier` is Basketball Reference's unique identifier for each pla his `player_identifier` is `westbru01` (you can see this from his player page URL: `https://www.basketball-reference.com/players/w/westbru01/gamelog/2020`) +### Get salary data for a team in a particular season + +```python +from basketball_reference_web_scraper import client +from basketball_reference_web_scraper.data import Team + +# Get salaries of all the players on the 1997-1998 Bulls team +client.team_salaries( + team=Team.CHICAGO_BULLS, + season_end_year=1998 +) + +# The team_salaries method supports all output behavior previously described +``` + ### Search ```python diff --git a/basketball_reference_web_scraper/client.py b/basketball_reference_web_scraper/client.py index fca35e8e..e6519f9c 100644 --- a/basketball_reference_web_scraper/client.py +++ b/basketball_reference_web_scraper/client.py @@ -7,7 +7,8 @@ from basketball_reference_web_scraper.writers import CSVWriter, RowFormatter, \ BOX_SCORE_COLUMN_NAMES, SCHEDULE_COLUMN_NAMES, PLAYER_SEASON_TOTALS_COLUMN_NAMES, \ 
PLAYER_ADVANCED_SEASON_TOTALS_COLUMN_NAMES, TEAM_BOX_SCORES_COLUMN_NAMES, PLAY_BY_PLAY_COLUMN_NAMES, \
-    PLAYER_SEASON_BOX_SCORE_COLUMN_NAMES, SearchResultsCSVWriter, SEARCH_RESULTS_COLUMN_NAMES
+    PLAYER_SEASON_BOX_SCORE_COLUMN_NAMES, SearchResultsCSVWriter, SEARCH_RESULTS_COLUMN_NAMES, \
+    SALARY_COLUMN_NAMES
 
 
 def player_box_scores(day, month, year, output_type=None, output_file_path=None, output_write_option=None,
@@ -133,6 +134,37 @@ def players_advanced_season_totals(season_end_year, include_combined_values=Fals
         json_options=json_options,
     )
 
+
+def team_salaries(team, season_end_year, output_type=None, output_file_path=None, output_write_option=None,
+                  json_options=None):
+    """
+    Get the salaries of the players of a team for a season.
+
+    :param team: team whose player salaries are requested
+    :param season_end_year: year in which the season ends (1998 for the 1997-1998 season)
+    :return: result of ``output`` for the parsed ``name`` / ``salary`` values
+    :raises InvalidSeason: when the team's season page responds with a 404
+    """
+    try:
+        http_service = HTTPService(parser=ParserService())
+        values = http_service.team_salaries(team, season_end_year)
+    except requests.exceptions.HTTPError as http_error:
+        if http_error.response.status_code == requests.codes.not_found:
+            raise InvalidSeason(season_end_year=season_end_year)
+        else:
+            raise http_error
+    return output(
+        values=values,
+        output_type=output_type,
+        output_file_path=output_file_path,
+        output_write_option=output_write_option,
+        csv_writer=CSVWriter(
+            column_names=SALARY_COLUMN_NAMES,
+            row_formatter=RowFormatter(data_field_names=SALARY_COLUMN_NAMES)
+        ),
+        json_options=json_options,
+    )
+
 
 def team_box_scores(day, month, year, output_type=None, output_file_path=None, output_write_option=None,
                     json_options=None):
diff --git a/basketball_reference_web_scraper/html.py b/basketball_reference_web_scraper/html.py
index 3513d4e3..060c04a3 100644
--- a/basketball_reference_web_scraper/html.py
+++ b/basketball_reference_web_scraper/html.py
@@ -1,5 +1,7 @@
 import re
 
+from basketball_reference_web_scraper.utilities import extract_html_obj_in_comment
+
 
 class BasicBoxScoreRow:
     def __init__(self, html):
@@ -1090,3 +1092,56 @@ def totals_table(self):
             return PlayerPageTotalsTable(html=totals_tables[0])
 
         return None
+
+
+class PlayerSalaryRow:
+    """Row of a team salary table holding one player's name and salary."""
+
+    def __init__(self, html):
+        self.html = html
+
+    @property
+    def name(self):
+        # Relative XPath: an absolute "//td" would match cells of every row in the
+        # document, not just the cells of this row.
+        cells = self.html.xpath('td[@data-stat="player"]')
+
+        if len(cells) > 0:
+            return cells[0].text_content()
+
+        return ''
+
+    @property
+    def salary(self):
+        cells = self.html.xpath('td[@data-stat="salary"]')
+
+        if len(cells) > 0:
+            # The salary value is read from the cell's "csk" attribute; None when it is absent
+            return cells[0].get('csk')
+
+        return None
+
+
+class TeamSalaryTable:
+    """Salary table of a team's season page."""
+
+    def __init__(self, html):
+        self.html = html
+
+    @property
+    def rows(self):
+        # Basketball Reference places this table inside an HTML comment instead of the
+        # regular markup (it is not visible on the page with JavaScript disabled), so
+        # it is read from the comments.
+        salary_table = extract_html_obj_in_comment(self.html, '//table[@id="salaries2"]')
+
+        # No comment holds the salary table
+        if salary_table is None:
+            return []
+
+        # Only rows with a player cell are treated as data rows; this skips the header
+        # row without mutating the parsed tree or relying on row positions.
+        return [
+            PlayerSalaryRow(html=row_html)
+            for row_html in salary_table.xpath('//table[@id="salaries2"]//tr[td[@data-stat="player"]]')
+        ]
diff --git a/basketball_reference_web_scraper/http_service.py b/basketball_reference_web_scraper/http_service.py
index 9dc25039..67fba474 100644
--- a/basketball_reference_web_scraper/http_service.py
+++ b/basketball_reference_web_scraper/http_service.py
@@ -5,7 +5,7 @@
 from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason
 from basketball_reference_web_scraper.html import DailyLeadersPage, PlayerSeasonBoxScoresPage, PlayerSeasonTotalTable, \
     PlayerAdvancedSeasonTotalsTable, PlayByPlayPage, SchedulePage, BoxScoresPage, DailyBoxScoresPage, SearchPage, \
-    PlayerPage
+    PlayerPage, TeamSalaryTable
 
 
 class HTTPService:
@@ -98,6 +98,25 @@ def players_season_totals(self, season_end_year):
         table = PlayerSeasonTotalTable(html=html.fromstring(response.content))
         return self.parser.parse_player_season_totals(totals=table.rows)
 
+    def team_salaries(self, team, season_end_year):
+        """
+        Fetch a team's season page and parse the salaries of its players.
+
+        :raises requests.exceptions.HTTPError: when the response status is unsuccessful
+        """
+        url = '{BASE_URL}/teams/{team_abbr}/{end_year}.html'.format(
+            BASE_URL=HTTPService.BASE_URL,
+            team_abbr=TEAM_TO_TEAM_ABBREVIATION[team],
+            end_year=season_end_year,
+        )
+
+        response = requests.get(url=url)
+
+        response.raise_for_status()
+
+        table = TeamSalaryTable(html=html.fromstring(response.content))
+        return self.parser.parse_team_salary(player_salaries=table.rows)
+
     def schedule_for_month(self, url):
         response = requests.get(url=url)
 
diff --git a/basketball_reference_web_scraper/parser_service.py b/basketball_reference_web_scraper/parser_service.py
index 9380ddc4..24aaac67 100644
--- a/basketball_reference_web_scraper/parser_service.py
+++ b/basketball_reference_web_scraper/parser_service.py
@@ -5,7 +5,7 @@
     SecondsPlayedParser, PlayerBoxScoresParser, PlayerAdvancedSeasonTotalsParser, PeriodDetailsParser, \
     PeriodTimestampParser, ScoresParser, PlayByPlaysParser, TeamNameParser, ScheduledStartTimeParser, \
     ScheduledGamesParser, PlayerBoxScoreOutcomeParser, PlayerSeasonBoxScoresParser, SearchResultNameParser, \
-    ResourceLocationParser, SearchResultsParser, LeagueAbbreviationParser, PlayerDataParser
+    ResourceLocationParser, SearchResultsParser, LeagueAbbreviationParser, PlayerDataParser, TeamSalaryParser
 
 
 class ParserService:
@@ -76,6 +76,7 @@ def __init__(self):
             league_abbreviation_parser=self.league_abbreviation_parser,
         )
         self.team_totals_parser = TeamTotalsParser(team_abbreviation_parser=self.team_abbreviation_parser)
+        self.team_salary_parser = TeamSalaryParser()
 
     def parse_play_by_plays(self, play_by_plays, away_team_name, home_team_name):
         return self.play_by_plays_parser.parse(
@@ -106,4 +107,7 @@ def parse_player_search_results(self, nba_aba_baa_players):
         return self.search_results_parser.parse(nba_aba_baa_players=nba_aba_baa_players)
 
     def parse_player_data(self, player):
-        return self.player_data_parser.parse(player=player)
\ No newline at end of file
+        return self.player_data_parser.parse(player=player)
+
+    def parse_team_salary(self, player_salaries):
+        return self.team_salary_parser.parse(player_salaries=player_salaries)
diff --git a/basketball_reference_web_scraper/parsers.py b/basketball_reference_web_scraper/parsers.py
index 16ec9005..40bc9228 100644
--- a/basketball_reference_web_scraper/parsers.py
+++ b/basketball_reference_web_scraper/parsers.py
@@ -552,3 +552,17 @@ def parse(self, player):
                 )
             )
         }
+
+
+class TeamSalaryParser:
+    """Converts salary table rows into dictionaries with "name" and "salary" keys."""
+
+    def parse(self, player_salaries):
+        return [
+            {
+                'name': row.name,
+                # A row without a salary value yields None instead of failing inside int()
+                'salary': int(row.salary) if row.salary else None,
+            }
+            for row in player_salaries
+        ]
diff --git a/basketball_reference_web_scraper/utilities.py b/basketball_reference_web_scraper/utilities.py
index ab008c42..8ec603da 100644
--- a/basketball_reference_web_scraper/utilities.py
+++ b/basketball_reference_web_scraper/utilities.py
@@ -1,3 +1,6 @@
+from lxml import etree, html
+
+
 def str_to_int(value, default=int(0)):
     stripped_value = value.strip()
     try:
@@ -18,3 +21,30 @@ def merge_two_dicts(first, second):
     combined = first.copy()
     combined.update(second)
     return combined
+
+
+def extract_html_obj_in_comment(html_tree, xpath):
+    """
+    Find the first HTML comment whose content matches an XPath expression.
+
+    :param html_tree: parsed document whose comments are searched
+    :param xpath: expression evaluated against the parsed content of each comment
+    :return: the parsed content of the first matching comment, or None if no comment matches
+    """
+    for node in html_tree.iter(etree.Comment):
+        comment = node.text
+
+        # Nothing to parse in an empty comment
+        if comment is None or not comment.strip():
+            continue
+
+        try:
+            extracted_html = html.fromstring(comment)
+        except etree.ParserError:
+            # The comment does not hold parsable markup
+            continue
+
+        if extracted_html.xpath(xpath):
+            return extracted_html
+
+    return None
diff --git a/basketball_reference_web_scraper/writers.py b/basketball_reference_web_scraper/writers.py
index e25d9f22..353937ef 100644
--- a/basketball_reference_web_scraper/writers.py
+++ b/basketball_reference_web_scraper/writers.py
@@ -130,6 +130,11 @@
     "leagues",
 ]
 
+SALARY_COLUMN_NAMES = [
+    "name",
+    "salary",
+]
+
 
 class WriteOptions:
     def __init__(self, file_path=None, mode=None, custom_options=None):