Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 30 additions & 24 deletions pybaseball/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,31 +7,31 @@
from .teamid_lookup import team_ids
from .statcast import statcast, statcast_single_game
from .statcast_pitcher import (
statcast_pitcher,
statcast_pitcher_exitvelo_barrels,
statcast_pitcher_expected_stats,
statcast_pitcher_pitch_arsenal,
statcast_pitcher_arsenal_stats,
statcast_pitcher_percentile_ranks,
statcast_pitcher_spin_dir_comp
statcast_pitcher,
statcast_pitcher_exitvelo_barrels,
statcast_pitcher_expected_stats,
statcast_pitcher_pitch_arsenal,
statcast_pitcher_arsenal_stats,
statcast_pitcher_percentile_ranks,
statcast_pitcher_spin_dir_comp,
)
from .statcast_batter import (
statcast_batter,
statcast_batter_exitvelo_barrels,
statcast_batter_expected_stats,
statcast_batter_percentile_ranks,
statcast_batter_pitch_arsenal,
statcast_batter_bat_tracking
statcast_batter,
statcast_batter_exitvelo_barrels,
statcast_batter_expected_stats,
statcast_batter_percentile_ranks,
statcast_batter_pitch_arsenal,
statcast_batter_bat_tracking,
)
from .statcast_running import statcast_sprint_speed, statcast_running_splits
from .statcast_fielding import (
statcast_outs_above_average,
statcast_outfield_directional_oaa,
statcast_outfield_catch_prob,
statcast_outfielder_jump,
statcast_catcher_poptime,
statcast_catcher_framing,
statcast_fielding_run_value
statcast_outs_above_average,
statcast_outfield_directional_oaa,
statcast_outfield_catch_prob,
statcast_outfielder_jump,
statcast_catcher_poptime,
statcast_catcher_framing,
statcast_fielding_run_value,
)
from .league_batting_stats import batting_stats_bref
from .league_batting_stats import batting_stats_range
Expand Down Expand Up @@ -78,8 +78,9 @@
from .lahman import salaries
from .lahman import schools
from .lahman import series_post
from .lahman import teams_core
from .lahman import teams_upstream
from .lahman import teams

# from .lahman import teams_upstream Not part of Lahman
from .lahman import teams_franchises
from .lahman import teams_half
from .lahman import download_lahman
Expand All @@ -98,7 +99,12 @@
from .plotting import spraychart
from .plotting import plot_teams
from .plotting import plot_strike_zone
from .datasources.fangraphs import (fg_batting_data, fg_pitching_data, fg_team_batting_data, fg_team_fielding_data,
fg_team_pitching_data)
from .datasources.fangraphs import (
fg_batting_data,
fg_pitching_data,
fg_team_batting_data,
fg_team_fielding_data,
fg_team_pitching_data,
)
from .split_stats import get_splits
from .version import __version__
106 changes: 70 additions & 36 deletions pybaseball/lahman.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@

from . import cache

url = "https://github.com/chadwickbureau/baseballdatabank/archive/master.zip"
base_string = "baseballdatabank-master"
url = "https://github.com/jmaslek/LahmanDatabase/archive/refs/heads/main/baseballdb.zip"
base_string = "LahmanDatabase-main/baseballdb"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jmaslek I didn't see a `LahmanDatabase-main/baseballdb` directory in your GitHub archive at https://github.com/jmaslek/LahmanDatabase/archive/refs/heads/main/baseballdb.zip

However, I did see a `LahmanDatabase-main/unzipped` directory there. Should `base_string` be `"LahmanDatabase-main/unzipped"` instead?

Suggested change
base_string = "LahmanDatabase-main/baseballdb"
base_string = "LahmanDatabase-main/unzipped"


_handle = None


def get_lahman_zip() -> Optional[ZipFile]:
# Retrieve the Lahman database zip file; returns None if the file already exists in cwd.
# If we already have the zip file, keep re-using that.
Expand All @@ -25,6 +26,7 @@ def get_lahman_zip() -> Optional[ZipFile]:
_handle = ZipFile(BytesIO(s.content))
return _handle


def download_lahman():
# download entire lahman db to present working directory
z = get_lahman_zip()
Expand All @@ -34,103 +36,135 @@ def download_lahman():
# this way we'll now start using the extracted zip directory
# instead of the session ZipFile object

def _get_file(tablename: str, quotechar: str = "'") -> pd.DataFrame:

def _get_file(tablename: str, quotechar: str = "'", encoding="latin1") -> pd.DataFrame:
z = get_lahman_zip()
f = f'{base_string}/{tablename}'
f = f"{base_string}/{tablename}"
data = pd.read_csv(
f"{path.join(cache.config.cache_directory, f)}" if z is None else z.open(f),
header=0,
sep=',',
quotechar=quotechar
sep=",",
quotechar=quotechar,
encoding=encoding,
)
return data


# do this for every table in the lahman db so they can exist as separate functions
def parks() -> pd.DataFrame:
return _get_file('core/Parks.csv')
return _get_file("Parks.csv")


def all_star_full() -> pd.DataFrame:
return _get_file("core/AllstarFull.csv")
return _get_file("AllstarFull.csv")


def appearances() -> pd.DataFrame:
return _get_file("core/Appearances.csv")
return _get_file("Appearances.csv")


def awards_managers() -> pd.DataFrame:
return _get_file("contrib/AwardsManagers.csv")
return _get_file("AwardsManagers.csv")


def awards_players() -> pd.DataFrame:
return _get_file("contrib/AwardsPlayers.csv")
return _get_file("AwardsPlayers.csv")


def awards_share_managers() -> pd.DataFrame:
return _get_file("contrib/AwardsShareManagers.csv")
return _get_file("AwardsShareManagers.csv")


def awards_share_players() -> pd.DataFrame:
return _get_file("contrib/AwardsSharePlayers.csv")
return _get_file("AwardsSharePlayers.csv")


def batting() -> pd.DataFrame:
return _get_file("core/Batting.csv")
return _get_file("Batting.csv")


def batting_post() -> pd.DataFrame:
return _get_file("core/BattingPost.csv")
return _get_file("BattingPost.csv")


def college_playing() -> pd.DataFrame:
return _get_file("contrib/CollegePlaying.csv")
return _get_file("CollegePlaying.csv")


def fielding() -> pd.DataFrame:
return _get_file("core/Fielding.csv")
return _get_file("Fielding.csv")


def fielding_of() -> pd.DataFrame:
return _get_file("core/FieldingOF.csv")
return _get_file("FieldingOF.csv")


def fielding_of_split() -> pd.DataFrame:
return _get_file("core/FieldingOFsplit.csv")
return _get_file("FieldingOFsplit.csv")


def fielding_post() -> pd.DataFrame:
return _get_file("core/FieldingPost.csv")
return _get_file("FieldingPost.csv")


def hall_of_fame() -> pd.DataFrame:
return _get_file("contrib/HallOfFame.csv")
return _get_file("HallOfFame.csv")


def home_games() -> pd.DataFrame:
return _get_file("core/HomeGames.csv")
return _get_file("HomeGames.csv")


def managers() -> pd.DataFrame:
return _get_file("core/Managers.csv")
return _get_file("Managers.csv")


def managers_half() -> pd.DataFrame:
return _get_file("core/ManagersHalf.csv")
return _get_file("ManagersHalf.csv")


def master() -> pd.DataFrame:
# Alias for people(): "master" is the old name of this table and "people"
# is the new one, so this simply returns whatever people() returns.
return people()


def people() -> pd.DataFrame:
return _get_file("core/People.csv")
return _get_file("People.csv")


def pitching() -> pd.DataFrame:
return _get_file("core/Pitching.csv")
return _get_file("Pitching.csv")


def pitching_post() -> pd.DataFrame:
return _get_file("core/PitchingPost.csv")
return _get_file("PitchingPost.csv")


def salaries() -> pd.DataFrame:
return _get_file("contrib/Salaries.csv")
return _get_file("Salaries.csv")


def schools() -> pd.DataFrame:
return _get_file("contrib/Schools.csv", quotechar='"') # different here bc of doublequotes used in some school names
return _get_file(
"Schools.csv", quotechar='"'
) # different here bc of doublequotes used in some school names


def series_post() -> pd.DataFrame:
return _get_file("core/SeriesPost.csv")
return _get_file("SeriesPost.csv")


def teams_core() -> pd.DataFrame:
return _get_file("core/Teams.csv")
def teams() -> pd.DataFrame:
return _get_file("Teams.csv")


# def teams_upstream() -> pd.DataFrame:
# return _get_file("Teams.csv") # manually maintained file

def teams_upstream() -> pd.DataFrame:
return _get_file("upstream/Teams.csv") # manually maintained file

def teams_franchises() -> pd.DataFrame:
return _get_file("core/TeamsFranchises.csv")
return _get_file("TeamsFranchises.csv")


def teams_half() -> pd.DataFrame:
return _get_file("core/TeamsHalf.csv")
return _get_file("TeamsHalf.csv")