%%capture
pip install pmdarima


# Suppress this cell's output
%%capture

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller


# Silence every warning for the rest of the session. A single blanket
# "ignore" filter (default category: Warning) already matches UserWarning,
# FutureWarning and every other Warning subclass, so the additional
# per-category filters were redundant and have been dropped.
warnings.filterwarnings("ignore")


# Load the stats CSV, then print its numeric summary followed by its
# (rows, columns) shape.
data = pd.read_csv("/content/stats.csv")
for overview in (data.describe(), data.shape):
    print(overview)

           player_id         year           pa    k_percent   bb_percent  \
count    1106.000000  1106.000000  1106.000000  1106.000000  1106.000000   
mean   562075.849910  2019.299277   643.334539    22.185986     7.434448   
std     86158.967831     2.897329   160.593994     4.951807     1.962328   
min    112526.000000  2015.000000   189.000000    10.400000     2.000000   
25%    502171.000000  2017.000000   563.250000    18.800000     6.100000   
50%    579328.000000  2019.000000   669.000000    21.500000     7.300000   
75%    622608.000000  2022.000000   758.000000    25.100000     8.700000   
max    694973.000000  2024.000000   951.000000    41.100000    17.900000   

              woba        bacon  z_swing_percent  z_swing_miss_percent  \
count  1106.000000  1106.000000      1106.000000           1106.000000   
mean      0.310686     0.325090        66.860036             16.652080   
std       0.031227     0.026412         3.088641              3.642283   
min       0.204000     0.242000        55.800000              6.600000   
25%       0.290000     0.307000        64.800000             14.125000   
50%       0.312000     0.325000        66.900000             16.550000   
75%       0.332000     0.344000        68.900000             18.900000   
max       0.417000     0.398000        76.100000             31.700000   

       oz_swing_percent  ...  flyballs_percent  n_ff_formatted  ff_avg_speed  \
count       1106.000000  ...       1106.000000     1069.000000   1069.000000   
mean          28.857414  ...         23.410127       33.876239     92.871656   
std            3.004760  ...          4.879343       16.366793      2.354689   
min           19.700000  ...          9.800000        0.000000     82.300000   
25%           26.900000  ...         20.000000       22.600000     91.500000   
50%           28.800000  ...         23.400000       35.600000     92.900000   
75%           30.700000  ...         26.800000       46.300000     94.400000   
max           39.800000  ...         38.800000       72.100000     99.100000   

       ff_avg_spin  ff_avg_break_x  ff_avg_break_z  offspeed_avg_break_z  \
count  1069.000000     1069.000000     1069.000000           1086.000000   
mean   2247.242283       -3.038728      -16.380449            -30.807919   
std     145.896179        7.532372        3.024717              4.034539   
min    1792.000000      -15.700000      -31.300000            -43.300000   
25%    2150.000000       -8.500000      -17.900000            -33.575000   
50%    2248.000000       -5.800000      -16.100000            -30.800000   
75%    2348.000000        3.000000      -14.300000            -28.100000   
max    2779.000000       18.100000       -9.200000            -15.400000   

       offspeed_avg_break_z_induced  offspeed_avg_break  offspeed_range_speed  
count                   1086.000000         1086.000000           1083.000000  
mean                       7.318508           15.816298              1.583564  
std                        3.744186            2.712864              0.429232  
min                       -5.600000            4.000000              0.900000  
25%                        4.725000           14.300000              1.300000  
50%                        7.400000           16.000000              1.500000  
75%                        9.600000           17.600000              1.700000  
max                       22.300000           25.000000              6.300000  

[8 rows x 36 columns]
(1106, 37)


# Count the missing entries in each column and print the per-column totals.
missing_per_column = data.isnull().sum()
print(missing_per_column)

last_name, first_name            0
player_id                        0
year                             0
pa                               0
k_percent                        0
bb_percent                       0
woba                             0
bacon                            0
z_swing_percent                  0
z_swing_miss_percent             0
oz_swing_percent                 0
oz_swing_miss_percent            0
oz_contact_percent               0
out_zone_swing_miss              0
out_zone_swing                   0
out_zone_percent                 0
out_zone                         0
meatball_swing_percent           0
meatball_percent                 0
pitch_count_offspeed             0
whiff_percent                    0
swing_percent                    0
straightaway_percent             0
batted_ball                      0
f_strike_percent                 0
groundballs_percent              0
groundballs                      0
flyballs_percent                 0
n_ff_formatted                  37
ff_avg_speed                    37
ff_avg_spin                     37
ff_avg_break_x                  37
ff_avg_break_z                  37
offspeed_avg_break_z            20
offspeed_avg_break_z_induced    20
offspeed_avg_break              20
offspeed_range_speed            23
dtype: int64


# The nine columns that reported missing values in the null-count check.
columns_with_missing = [
    'n_ff_formatted',
    'ff_avg_speed',
    'ff_avg_spin',
    'ff_avg_break_x',
    'ff_avg_break_z',
    'offspeed_avg_break_z',
    'offspeed_avg_break_z_induced',
    'offspeed_avg_break',
    'offspeed_range_speed',
]

# Draw one histogram (with KDE overlay) per column on a 3x3 grid so the shape
# of each distribution can be inspected before choosing an imputation value.
hist_fig = plt.figure(figsize=(15, 10))
for position, column in enumerate(columns_with_missing, start=1):
    panel = hist_fig.add_subplot(3, 3, position)
    sns.histplot(data[column], kde=True, bins=20, ax=panel)
    panel.set_title(f'{column} Distribution')

hist_fig.tight_layout()
plt.show()


# Fill missing values column by column. Each filled column is assigned back
# to the frame instead of calling ``data[col].fillna(..., inplace=True)``:
# the in-place form operates on an intermediate Series (chained assignment),
# which pandas warns about and which does not update ``data`` when
# Copy-on-Write is enabled.

# Impute using the mean for normally distributed columns
columns_mean = ['ff_avg_speed', 'ff_avg_spin', 'offspeed_avg_break']
for col in columns_mean:
    data[col] = data[col].fillna(data[col].mean())

# Impute using the median for skewed columns
columns_median = ['n_ff_formatted', 'ff_avg_break_x', 'ff_avg_break_z',
                  'offspeed_avg_break_z', 'offspeed_avg_break_z_induced', 'offspeed_range_speed']
for col in columns_median:
    data[col] = data[col].fillna(data[col].median())

# Confirm that no missing values remain
print(data.isnull().sum())

last_name, first_name           0
player_id                       0
year                            0
pa                              0
k_percent                       0
bb_percent                      0
woba                            0
bacon                           0
z_swing_percent                 0
z_swing_miss_percent            0
oz_swing_percent                0
oz_swing_miss_percent           0
oz_contact_percent              0
out_zone_swing_miss             0
out_zone_swing                  0
out_zone_percent                0
out_zone                        0
meatball_swing_percent          0
meatball_percent                0
pitch_count_offspeed            0
whiff_percent                   0
swing_percent                   0
straightaway_percent            0
batted_ball                     0
f_strike_percent                0
groundballs_percent             0
groundballs                     0
flyballs_percent                0
n_ff_formatted                  0
ff_avg_speed                    0
ff_avg_spin                     0
ff_avg_break_x                  0
ff_avg_break_z                  0
offspeed_avg_break_z            0
offspeed_avg_break_z_induced    0
offspeed_avg_break              0
offspeed_range_speed            0
dtype: int64


# Panel 1 of a 1x3 grid: wOBA histogram with a KDE overlay.
# NOTE(review): the other two panels are drawn by separate snippets further
# down; if each runs as its own notebook cell they will not share one
# figure -- confirm the intended layout.
woba_ax = plt.subplot(1, 3, 1)
sns.histplot(data['woba'], kde=True, bins=20, ax=woba_ax)
woba_ax.set_title('wOBA Distribution')

Text(0.5, 1.0, 'wOBA Distribution')


# Panel 2 of a 1x3 grid: Strikeout Percentage (K%) histogram with a KDE overlay.
k_ax = plt.subplot(1, 3, 2)
sns.histplot(data['k_percent'], kde=True, bins=20, ax=k_ax)
k_ax.set_title('K% Distribution')

Text(0.5, 1.0, 'K% Distribution')


# Panel 3 of a 1x3 grid: Walk Percentage (BB%) histogram with a KDE overlay.
bb_ax = plt.subplot(1, 3, 3)
sns.histplot(data['bb_percent'], kde=True, bins=20, ax=bb_ax)
bb_ax.set_title('BB% Distribution')

# Tighten the spacing of the current figure, then render it.
plt.gcf().tight_layout()
plt.show()


# Standalone 5x5 histogram (with KDE overlay) of the whiff_percent column.
whiff_fig, whiff_ax = plt.subplots(figsize=(5, 5))
sns.histplot(data['whiff_percent'], kde=True, bins=20, ax=whiff_ax)
whiff_ax.set_title('Whiff% Distribution')
whiff_ax.set_xlabel('Whiff Percentage')
whiff_ax.set_ylabel('Count')
plt.show()


# Yearly trend of each headline metric, drawn as stacked panels (one row per
# metric). Each entry is (column in `data`, panel title, y-axis label).
trend_panels = [
    ('whiff_percent', 'Whiff% Over Time', 'Whiff%'),
    ('woba', 'wOBA Over Time', 'wOBA'),
    ('k_percent', 'K% (Strikeout Percentage) Over Time', 'K%'),
    ('bb_percent', 'BB% (Walk Percentage) Over Time', 'BB%'),
]

# One row per metric, single column.
fig, axes = plt.subplots(len(trend_panels), 1, figsize=(10, 16))

for ax, (column, title, y_label) in zip(axes, trend_panels):
    # errorbar=None is the replacement for the deprecated ci=None
    # (seaborn >= 0.12): draw the line only, without an uncertainty band.
    sns.lineplot(data=data, x='year', y=column, ax=ax, errorbar=None)
    ax.set_title(title)
    ax.set_xlabel('Year')
    ax.set_ylabel(y_label)

# Adjust layout
plt.tight_layout()
plt.show()


def forecast_metric(data, metric_column, metric_name, forecast_years=5):
    """Plot a metric's yearly average, fit an ARIMA model and forecast it.

    The metric is averaged per ``year``; ``auto_arima`` searches for a
    non-seasonal ``(p, d, q)`` order; an ``ARIMA`` model is refit with that
    order and its summary printed; finally the history and the forecast are
    plotted together.

    Args:
        data: DataFrame containing a ``year`` column and ``metric_column``.
            The last year must be integer-like, since the forecast years are
            generated by counting up from it.
        metric_column: Name of the column to aggregate and forecast.
        metric_name: Display label used in plot titles, legends and prints.
        forecast_years: Number of yearly steps to forecast; must be >= 1.

    Returns:
        pandas.Series holding the forecast values, indexed by forecast year
        (the year after the last observed one onwards).

    Raises:
        ValueError: If ``forecast_years`` is less than 1.
    """
    if forecast_years < 1:
        raise ValueError("forecast_years must be at least 1")

    # Step 1: Aggregate the metric data by year
    metric_by_year = data.groupby('year')[metric_column].mean()

    # Step 2: Plot the original time series for the metric
    plt.figure(figsize=(10, 5))
    plt.plot(metric_by_year.index, metric_by_year.values, marker='o')
    plt.title(f'{metric_name} Over Time')
    plt.xlabel('Year')
    plt.ylabel(metric_name)
    plt.grid(True)
    plt.show()

    # Step 3: Use Auto ARIMA to determine the best p, d, q parameters
    auto_model = auto_arima(metric_by_year, seasonal=False, trace=True, suppress_warnings=True)

    # Step 4: Fit the ARIMA model based on the recommended (p, d, q) values
    best_pdq = auto_model.order
    print(f"Best ARIMA order for {metric_name}: {best_pdq}")

    # NOTE(review): only the (p, d, q) order is carried over from auto_arima;
    # any intercept/trend choice it made is not passed on -- confirm that
    # refitting with ARIMA's default trend is intended.
    model = ARIMA(metric_by_year, order=best_pdq)
    model_fit = model.fit()

    # Step 5: Summary of the model
    print(model_fit.summary())

    # Step 6: Forecast future metric values (next 5 years by default)
    forecast = model_fit.forecast(steps=forecast_years)

    # Step 7: Label the forecast with the years that follow the last
    # observed one, so the values are tied to years rather than to whatever
    # index the model attached to them.
    last_year = int(metric_by_year.index[-1])
    future_years = list(range(last_year + 1, last_year + 1 + forecast_years))
    forecast_by_year = pd.Series(
        list(forecast),
        index=pd.Index(future_years, name='year'),
        name=metric_column,
    )

    # Step 8: Plot the forecast
    plt.figure(figsize=(10, 5))
    plt.plot(metric_by_year.index, metric_by_year.values, label=f'Historical {metric_name}')
    plt.plot(forecast_by_year.index, forecast_by_year.values,
             label=f'Forecasted {metric_name}', marker='o', linestyle='--')
    plt.title(f'{metric_name} Forecast')
    plt.xlabel('Year')
    plt.ylabel(metric_name)
    plt.legend()
    plt.grid(True)
    plt.show()

    return forecast_by_year


# Run the same forecasting routine for each headline metric, five years
# ahead, in the order Whiff%, wOBA, K%, BB%.
for metric_column, metric_label in [
    ('whiff_percent', 'Whiff%'),
    ('woba', 'wOBA'),
    ('k_percent', 'K%'),
    ('bb_percent', 'BB%'),
]:
    forecast_metric(data, metric_column, metric_label, forecast_years=5)


!cp "/content/drive/MyDrive/Colab Notebooks/silverstein_time_series.ipynb" ./
!jupyter nbconvert --to html "silverstein_time_series.ipynb"

Time Series Analysis Predicting Future Pitching Performances

Created by Scott Silverstein

Questions

EDA

Summary Stats

Missing Values

Distributions of Key Statistics

Trends

Modeling

WHIFF %

wOBA

K %

BB%