10 changes: 10 additions & 0 deletions basketball_reference_web_scraper/client.py
@@ -1,5 +1,9 @@
from typing import Any, Callable

import requests

from basketball_reference_web_scraper.contracts.data.models import PlayerContract
from basketball_reference_web_scraper.contracts.data.parsers import create_player_contract
from basketball_reference_web_scraper.errors import InvalidSeason, InvalidDate, InvalidPlayerAndSeason
from basketball_reference_web_scraper.http_service import HTTPService
from basketball_reference_web_scraper.output.columns import BOX_SCORE_COLUMN_NAMES, SCHEDULE_COLUMN_NAMES, \
@@ -250,3 +254,9 @@ def search(term, output_type=None, output_file_path=None, output_write_option=No
        csv_writer=SearchCSVWriter(value_formatter=format_value)
    )
    return output_service.output(data=values, options=options)


def player_contracts(player_contract_processor: Callable[[PlayerContract], Any]):
    HTTPService(parser=ParserService()).player_contracts(
        player_contract_processor=lambda player_row_contract_data: player_contract_processor(
            create_player_contract(player_row_contract_data)))
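
For orientation, a minimal sketch of how a caller might drive this new callback-based entry point; the collecting list, the callback name, and the printed fields are illustrative and not part of this diff.

from basketball_reference_web_scraper import client

# Hypothetical consumer: collect each parsed PlayerContract as it streams in.
contracts = []

def collect(contract):
    contracts.append(contract)

client.player_contracts(player_contract_processor=collect)

for contract in contracts[:3]:
    # PlayerContract exposes the nested Player and guaranteed Salary dataclasses.
    print(contract.player.name, contract.guaranteed_salary.amount)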
61 changes: 61 additions & 0 deletions basketball_reference_web_scraper/contracts/data/models.py
@@ -0,0 +1,61 @@
from dataclasses import dataclass
from decimal import Decimal
from typing import Dict, Optional

from basketball_reference_web_scraper.data import Team


@dataclass(frozen=True)
class Player:
    identifier: str
    name: str

    def __post_init__(self):
        if 0 == len(self.identifier):
            raise ValueError("identifier should not be an empty string")

        if 0 == len(self.name):
            raise ValueError("name should not be an empty string")

        if any(char.isspace() for char in self.identifier):
            raise ValueError(f"identifier: {self.identifier} should not contain whitespace")


@dataclass(frozen=True)
class Salary:
    amount: Decimal
    currency: str

    def __post_init__(self):
        if self.amount is None:
            raise ValueError("amount should not be None")

        if 0 > self.amount:
            raise ValueError("amount should not be negative")


@dataclass(frozen=True)
class PlayerContract:
    player: Player
    team: Team
    salaries_by_season_start_year: Dict[int, Optional[Salary]]
    guaranteed_salary: Salary

    def __post_init__(self):
        if self.player is None:
            raise ValueError("player should not be None")

        if self.team is None:
            raise ValueError("team should not be None")

        if self.salaries_by_season_start_year is None:
            raise ValueError("season salaries should not be None")

        if 0 == len(self.salaries_by_season_start_year):
            raise ValueError("season salaries should not be empty")

        if self.guaranteed_salary is None:
            raise ValueError("guaranteed salary should not be None")

        if all(salary is None for salary in self.salaries_by_season_start_year.values()):
            raise ValueError("not all salaries should be None")
59 changes: 59 additions & 0 deletions basketball_reference_web_scraper/contracts/data/parsers.py
@@ -0,0 +1,59 @@
from datetime import datetime
from typing import Dict

from typing import Protocol
from price_parser import Price

from basketball_reference_web_scraper.contracts.data.models import Salary, PlayerContract, Player
from basketball_reference_web_scraper.contracts.page.parsers import PlayerRowData, PlayerContractData
from basketball_reference_web_scraper.data import TEAM_ABBREVIATIONS_TO_TEAM

GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE = "remain_gtd"


class PlayerContractRowDataProcessor(Protocol):
    def process_row(self, headers: Dict[str, str], row_data: PlayerRowData) -> PlayerContract:
        raise NotImplementedError()


def parse_season_start_date(serialized_season: str) -> int:
    return datetime.strptime(serialized_season, "%Y-%y").year


def parse_player_contract_values(contract_values_by_column_identifier: Dict[str, str],
                                 column_names_by_identifier: Dict[str, str]):
    guaranteed_salary_value = contract_values_by_column_identifier.get(GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE, None)
    if guaranteed_salary_value:
        parsed_guaranteed_salary = Price.fromstring(price=guaranteed_salary_value)

        return (
            dict(map(lambda item: (
                item[0], None if item[1] is None else Salary(amount=item[1].amount, currency=item[1].currency)),
                map(lambda item: (item[0], None if item[1] is None else Price.fromstring(item[1])),
                    map(lambda item: (parse_season_start_date(item[0]), item[1]),
                        map(lambda item: (column_names_by_identifier.get(item[0]), item[1]),
                            filter(lambda item: item[0] != GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE,
                                   contract_values_by_column_identifier.items())))))),
            Salary(
                amount=parsed_guaranteed_salary.amount,
                currency=parsed_guaranteed_salary.currency
            ))

    raise ValueError("Unparseable player contract values")


def create_player_contract(player_contract_data: PlayerContractData):
    salaries_by_season, guaranteed_salary = parse_player_contract_values(
        contract_values_by_column_identifier=player_contract_data.row.values_by_header,
        column_names_by_identifier=player_contract_data.headers
    )

    return PlayerContract(
        player=Player(
            identifier=player_contract_data.row.id,
            name=player_contract_data.row.name
        ),
        team=TEAM_ABBREVIATIONS_TO_TEAM.get(player_contract_data.row.team_abbreviation, None),
        salaries_by_season_start_year=salaries_by_season,
        guaranteed_salary=guaranteed_salary
    )
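
The nested map/filter chain above translates each column identifier to its season header, then to a start year, and runs each salary string through price_parser, while the "remain_gtd" column is split off as the guaranteed total. A rough sketch of the expected input/output shape; the "y1" identifier and the dollar values are invented.

from basketball_reference_web_scraper.contracts.data.parsers import (
    parse_player_contract_values,
    parse_season_start_date,
)

# "2023-24" style season headers resolve to their start year.
assert parse_season_start_date("2023-24") == 2023

# "y1" and the row values are invented; "remain_gtd" is the guaranteed-total column.
salaries, guaranteed = parse_player_contract_values(
    contract_values_by_column_identifier={"y1": "$1,000,000", "remain_gtd": "$1,000,000"},
    column_names_by_identifier={"y1": "2023-24"},
)
assert 2023 in salaries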
81 changes: 81 additions & 0 deletions basketball_reference_web_scraper/contracts/page/parsers.py
@@ -0,0 +1,81 @@
from collections import defaultdict
from dataclasses import dataclass
from typing import Optional, Dict, Callable

from lxml.etree import HTMLPullParser, LxmlError

from basketball_reference_web_scraper.contracts.data.models import PlayerContract


@dataclass(frozen=False)
class PlayerRowData:
    id: Optional[str]
    name: Optional[str]
    team_abbreviation: Optional[str]
    values_by_header: Dict[str, Optional[str]]


@dataclass(frozen=True)
class PlayerContractData:
    row: PlayerRowData
    headers: Dict[str, str]


class NothingMoreToParse(StopIteration):
    """
    Raised to signal that the player contract data in the player contracts page HTML has been fully parsed.
    """
    pass


class PlayerContractsPageParser:
    def __init__(self, player_contract_data_processor: Callable[[PlayerContractData], PlayerContract]):
        self._data_processor = player_contract_data_processor
        self._seen_table = False
        self._headers = defaultdict(str)
        self._current_player_data = PlayerRowData(id=None, name=None, team_abbreviation=None, values_by_header={})

    def __enter__(self):
        self._html_parser = HTMLPullParser(events=["start", "end"], tag=["table", "th", "tr", "td"])
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            self._html_parser.close()
        except LxmlError:
            # LxmlErrors can occur when forcibly stopping processing (i.e. closing the parser) since there's no more
            # relevant data to parse
            pass

    def parse(self, chunk) -> None:
        self._html_parser.feed(str(chunk))

        for event, element in self._html_parser.read_events():
            if event == "start" and element.tag == "table" and element.attrib.get("id") == "player-contracts":
                self._seen_table = True
            elif self._seen_table:
                if event == "end" and element.tag == "th" and element.attrib.get(
                        "data-stat") is not None and element.attrib.get("scope") == "col":
                    self._headers[element.attrib.get('data-stat')] = element.text
                    element.clear(keep_tail=True)
                elif event == "start" and element.tag == "tr" and element.attrib.get("class") is None:
                    self._current_player_data = PlayerRowData(id=None, name=None, team_abbreviation=None,
                                                              values_by_header={})
                elif event == "end" and element.tag == "tr" and element.attrib.get("class") is None:
                    element.clear(keep_tail=True)
                    if self._current_player_data.id is not None:
                        self._data_processor(PlayerContractData(
                            row=self._current_player_data,
                            headers=self._headers
                        ))
                elif event == "end" and element.tag == "td" and element.attrib.get("data-stat") is not None:
                    if element.attrib.get('data-stat') == "player":
                        self._current_player_data.id = element.attrib.get('data-append-csv')
                        self._current_player_data.name = "".join(element.itertext())
                    elif element.attrib.get('data-stat') == "team_id":
                        self._current_player_data.team_abbreviation = "".join(element.itertext())
                    else:
                        self._current_player_data.values_by_header[element.attrib.get('data-stat')] = element.text
                    element.clear(keep_tail=True)
                elif event == "end" and element.tag == "table" and element.attrib.get("id") == "player-contracts":
                    raise NothingMoreToParse()
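
To make the event flow concrete, a rough sketch that feeds the parser a hand-written fragment in the shape it matches on (table id, data-stat attributes); the markup is invented and far smaller than the real contracts page, which streams in as chunks.

from basketball_reference_web_scraper.contracts.page.parsers import (
    NothingMoreToParse,
    PlayerContractsPageParser,
)

# Invented markup mimicking the structure the parser looks for.
fragment = (
    '<table id="player-contracts">'
    '<thead><tr><th scope="col" data-stat="y1">2023-24</th></tr></thead>'
    '<tbody><tr>'
    '<td data-stat="player" data-append-csv="doeja01">John Doe</td>'
    '<td data-stat="team_id">BOS</td>'
    '<td data-stat="y1">$1,000,000</td>'
    '</tr></tbody></table>'
)

with PlayerContractsPageParser(player_contract_data_processor=print) as parser:
    try:
        parser.parse(chunk=fragment)
    except NothingMoreToParse:
        # Raised once </table> for "player-contracts" has been seen.
        pass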
43 changes: 37 additions & 6 deletions basketball_reference_web_scraper/http_service.py
@@ -1,11 +1,21 @@
from typing import Callable

import requests
from lxml import html

from basketball_reference_web_scraper.data import TEAM_TO_TEAM_ABBREVIATION, TeamTotal, PlayerData
from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason
from basketball_reference_web_scraper.html import DailyLeadersPage, PlayerSeasonBoxScoresPage, PlayerSeasonTotalTable, \
from basketball_reference_web_scraper.content import DailyLeadersPage, PlayerSeasonBoxScoresPage, \
PlayerSeasonTotalTable, \
PlayerAdvancedSeasonTotalsTable, PlayByPlayPage, SchedulePage, BoxScoresPage, DailyBoxScoresPage, SearchPage, \
PlayerPage, StandingsPage
from basketball_reference_web_scraper.contracts.data.models import PlayerContract
from basketball_reference_web_scraper.contracts.page.parsers import PlayerContractsPageParser, NothingMoreToParse, \
PlayerContractData
from basketball_reference_web_scraper.data import TEAM_TO_TEAM_ABBREVIATION, TeamTotal, PlayerData
from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason


class CouldNotGetPlayerContractData(Exception):
    pass


class HTTPService:
@@ -26,7 +36,7 @@ def standings(self, season_end_year):

page = StandingsPage(html=html.fromstring(response.content))
return self.parser.parse_division_standings(standings=page.division_standings.eastern_conference_table.rows) + \
self.parser.parse_division_standings(standings=page.division_standings.western_conference_table.rows)
self.parser.parse_division_standings(standings=page.division_standings.western_conference_table.rows)

def player_box_scores(self, day, month, year):
url = '{BASE_URL}/friv/dailyleaders.cgi?month={month}&day={day}&year={year}'.format(
@@ -65,7 +75,8 @@ def regular_season_player_box_scores(self, player_identifier, season_end_year, i
if page.regular_season_box_scores_table is None:
raise InvalidPlayerAndSeason(player_identifier=player_identifier, season_end_year=season_end_year)

return self.parser.parse_player_season_box_scores(box_scores=page.regular_season_box_scores_table.rows, include_inactive_games=include_inactive_games)
return self.parser.parse_player_season_box_scores(box_scores=page.regular_season_box_scores_table.rows,
include_inactive_games=include_inactive_games)

def playoff_player_box_scores(self, player_identifier, season_end_year, include_inactive_games=False):
# Makes assumption that basketball reference pattern of breaking out player pathing using first character of
@@ -86,7 +97,8 @@ def playoff_player_box_scores(self, player_identifier, season_end_year, include_
if page.playoff_box_scores_table is None:
raise InvalidPlayerAndSeason(player_identifier=player_identifier, season_end_year=season_end_year)

return self.parser.parse_player_season_box_scores(box_scores=page.playoff_box_scores_table.rows, include_inactive_games=include_inactive_games)
return self.parser.parse_player_season_box_scores(box_scores=page.playoff_box_scores_table.rows,
include_inactive_games=include_inactive_games)

def play_by_play(self, home_team, day, month, year):
add_0_if_needed = lambda s: "0" + s if len(s) == 1 else s
@@ -240,3 +252,22 @@ def search(self, term):
return {
"players": player_results
}

    def player_contracts(self, player_contract_processor: Callable[[PlayerContractData], PlayerContract]) -> None:
        with requests.get(
                url=f"{HTTPService.BASE_URL}/contracts/players.html",
                stream=True,
        ) as response:
            if not response.ok:
                raise CouldNotGetPlayerContractData()

            if response.encoding is None:
                response.encoding = 'utf-8'

            with PlayerContractsPageParser(player_contract_data_processor=player_contract_processor) as p:
                for chunk in response.iter_content(chunk_size=500, decode_unicode=True):
                    try:
                        p.parse(chunk=chunk)
                    except NothingMoreToParse:
                        break