
Which Stats Are Actually Important¶

A Penalized Regression to Determine Which Hitting Stats Fans Should Pay Attention To¶

Created by Scott Silverstein¶

Questions¶

Question 1: From what perspective are you conducting the analysis? (Who are you? / Who are you working for?)

  • I am conducting this analysis from the perspective of a baseball blogger. As an avid baseball fan, I am creating an analytical baseball blog for both my portfolio and personal interest. My goal is to delve deeper into quantifiable player evaluation, exploring which statistics truly matter in assessing player performance and providing insights that resonate with the broader baseball fan community.

Question 2: What is your question?

  • The question I aim to answer is: Which offensive statistics, such as hits, home runs, strikeouts, and others, best predict a player's wRC+ (Weighted Runs Created Plus)? wRC+ is widely regarded as one of the most reliable metrics for evaluating a player's offensive contribution, which is why it serves as the foundation of this analysis. Through this study, I hope to bridge the gap between the 'baseball progressives,' who embrace advanced metrics, and the 'baseball conservatives,' who prefer traditional statistics, by identifying which stats truly matter in assessing player quality.

Question 3: Describe your dataset(s) including URL (if available).

  • The dataset used for this analysis comes from the FanGraphs Leaderboards, which contain offensive statistics for MLB players. It includes metrics such as hits, home runs, strikeouts, batting average, and wRC+, among many others. This data will be used to perform penalized regression techniques such as Lasso and Ridge regression.

Question 4: What is(are) your independent variable(s) and dependent variable(s)? Include variable type (binary, categorical, numeric).

  • Independent variables: offensive metrics such as Hits, Home Runs, Walks, Strikeouts, Batting Average, and On-Base Percentage (all numeric).
  • Dependent variable: wRC+ (numeric).

Question 5: How are your variables suitable for your analysis method?

  • The independent variables are numeric and likely to be correlated, which makes penalized regression methods like Lasso and Ridge ideal for addressing multicollinearity. These techniques help identify the most important variables in predicting wRC+ while controlling for overfitting. Since multicollinearity is expected (a player's hit total and batting average, to name just one example, are strongly correlated), penalized regression is well suited to this project, keeping the analysis robust and interpretable. A small synthetic sketch of this behavior follows below.
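To see concretely why this matters, here is a minimal sketch on synthetic data (hypothetical, not the FanGraphs dataset): given two nearly identical predictors, Lasso tends to zero one of them out while Ridge shrinks the weight and splits it between both.

In [ ]:
# Minimal sketch on synthetic data (not the FanGraphs dataset): two
# near-duplicate predictors illustrate how Lasso and Ridge respond
# differently to multicollinearity.
import numpy as np
from sklearn.linear_model import Lasso, Ridge

rng = np.random.default_rng(0)
x1 = rng.normal(size=500)
x2 = x1 + rng.normal(scale=0.01, size=500)         # near-duplicate of x1
X_demo = np.column_stack([x1, x2])
y_demo = 3 * x1 + rng.normal(scale=0.1, size=500)

# Lasso typically zeroes one of the duplicates; Ridge splits the weight
print("Lasso coefficients:", Lasso(alpha=0.1).fit(X_demo, y_demo).coef_)
print("Ridge coefficients:", Ridge(alpha=1.0).fit(X_demo, y_demo).coef_)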

Question 6: What are your conclusions (include references to one or two CLEARLY INDICATED AND IMPORTANT graphs or tables in your output)?

  • The results from both Lasso and Ridge regression show that modern sabermetric statistics, such as SLG+ and OBP+, are highly predictive of a player's wRC+. These advanced metrics are consistently selected as the most important features in the models, highlighting the relevance of modern baseball statistics in player evaluation.

  • Lasso Regression effectively selected a few key features, confirming that SLG+ and OBP+ are the most important factors in predicting a player's offensive success. This demonstrates Lasso's ability to simplify the model by eliminating less important features.

  • Ridge Regression retained more features but still produced a low Mean Squared Error (MSE) of 6.0645 and a high R² Score of 0.9895, showing that it also performs well in predicting wRC+ while handling multicollinearity effectively.

Key Graphs/Tables:

  • Top Features from Lasso Regression: The table of top features selected by Lasso clearly indicates that modern sabermetric metrics are the most important predictors of wRC+.
  • Model Performance (MSE and R²): The performance of Ridge regression with a low MSE and high R² score further validates the strength of these metrics in predicting player success.

These results confirm that modern sabermetric statistics are essential for understanding player performance and provide a strong foundation for future analysis.

Question 7: What are your assumptions and limitations? Did you include any robustness checks?

  • The analysis assumes that the dataset is a reliable representation of player performance and that wRC+ is a valid indicator of offensive contribution. It also assumes that the penalized regression methods (Lasso and Ridge) will effectively handle multicollinearity and improve the robustness of the model. One limitation is that the model may not capture all potential interactions between variables. As a robustness check, the results of Lasso and Ridge regression are compared (see the coefficient comparison after the Ridge analysis).

Packages¶

In [1]:
%%capture
# Suppress cell output (a cell magic must be the first line of its cell)

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
In [2]:
import warnings
warnings.filterwarnings('ignore')

EDA¶

Initial Observations:

  • The dataset includes columns such as PA (plate appearances), HR (home runs), R (runs), RBI (runs batted in), SB (stolen bases), and many others.
  • There are also multiple columns related to Fantasy Baseball statistics (e.g., FPTS, SPTS).
  • The total number of columns is quite large (1749), so we will need to focus on relevant columns for our analysis, such as those directly related to player performance and wRC+.

Missing Values¶

In [3]:
# Load the dataset
file_path = '/content/fangraphs-leaderboards.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1')
In [4]:
# Check for missing values in all columns
missing_values_all = data.isnull().sum()

# Filter out columns that have missing values (greater than 0)
missing_values_filtered = missing_values_all[missing_values_all > 0]

# Create a table showing the missing values
missing_values_table = pd.DataFrame({
    'Column': missing_values_filtered.index,
    'Missing Values': missing_values_filtered.values
})

# Sort the table by missing values in descending order and display the top 15
missing_values_table_sorted = missing_values_table.sort_values(by='Missing Values', ascending=False)

# Display the top 15 rows
print(missing_values_table_sorted.head(15))
            Column  Missing Values
750   wFT/C (sc).2            4450
1475        xSLG.4            4450
300           SPTS            4450
100       vFT (sc)            4450
98        UN% (sc)            4450
876          xBA.2            4450
877         xSLG.2            4450
878        xwOBA.3            4450
86        FT% (sc)            4450
451   wFT/C (sc).1            4450
895            $.2            4450
896         FPTS.2            4450
897       FPTS/G.2            4450
898         SPTS.2            4450
899       SPTS/G.2            4450

For the purposes of this project, dropping columns with more than 20% missing data is a reasonable approach. Since the goal is to conduct a definitive analysis of which offensive statistics best predict wRC+, having a large amount of missing data in key columns could obscure the true relationships between variables. Imputation strategies, while useful in certain cases, may introduce bias or uncertainty, which could compromise the accuracy and clarity of the results.

Given that we are trying to determine the most impactful statistics in a clean and interpretable way, it’s essential to minimize noise and ambiguity. By removing columns with significant amounts of missing data, we ensure that the remaining variables are representative of the dataset without overly relying on estimated or imputed values. This allows the analysis to remain focused on solid, complete data and gives a clearer picture of the relationships between offensive stats and wRC+.

In [5]:
# Calculate the percentage of missing data for each column
missing_percentage = data.isnull().mean()

# Set the threshold for missing data
threshold = 0.20

# Drop columns that have more than 20% missing data
data_cleaned = data.loc[:, missing_percentage <= threshold]

# Check how many columns remain after dropping
print(f"Remaining columns after dropping those with more than 10% missing data: {data_cleaned.shape[1]}")

# Show the first few rows of the cleaned data
data_cleaned.head()
Remaining columns after dropping those with more than 20% missing data: 328
Out[5]:
Name G PA HR R RBI SB BB% K% ISO ... Events.4 L-WAR.4 wOppTeamV.4 wTeamV.4 wNetBatV.4 TG.4 Bats.4 NameASCII PlayerId MLBAMID
0 Trea Turner 1125 4967 171 776 572 279.0 0.068452 0.185424 0.185201 ... 3670 39.062987 0.179598 0.000000 0.179598 1527 R Trea Turner 16252 607208.0
1 Kevin Kiermaier 1159 4040 95 499 378 132.0 0.068564 0.224752 0.156165 ... 2547 25.809922 0.000000 0.000000 0.000000 1851 L Kevin Kiermaier 11038 595281.0
2 Billy Hamilton 951 3285 24 454 189 326.0 0.070015 0.215525 0.086680 ... 1868 11.724086 0.000000 0.000000 0.000000 1622 B Billy Hamilton 10199 571740.0
3 Christian Yelich 1466 6383 204 945 748 205.0 0.120633 0.216356 0.178500 ... 3562 43.777566 0.229549 -0.041572 0.187977 1842 L Christian Yelich 11477 592885.0
4 José Ramírez 1451 6086 255 898 864 243.0 0.098587 0.119619 0.224847 ... 4494 52.777648 0.181303 -0.187837 -0.006534 1839 B Jose Ramirez 13510 608070.0

5 rows × 328 columns

We are handling missing values in the dataset to ensure that our analysis is based on complete data. Missing values can distort or obscure relationships between variables, leading to inaccurate or biased results. By imputing the median for numerical columns and the mode for categorical columns, we maintain the dataset's integrity while minimizing the impact of missing data. Imputation helps retain valuable information without the need for extensive data removal, ensuring that we can conduct a thorough and robust analysis.

In [6]:
# select numeric and categorical columns
numeric_columns = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns

# imputation
data_cleaned.loc[:, numeric_columns] = data_cleaned[numeric_columns].fillna(data_cleaned[numeric_columns].median())
data_cleaned.loc[:, categorical_columns] = data_cleaned[categorical_columns].fillna(data_cleaned[categorical_columns].mode().iloc[0])

# Check if there are any remaining missing values
remaining_missing_values = data_cleaned.isnull().sum().sum()
print(f"Total remaining missing values after imputation: {remaining_missing_values}")
Total remaining missing values after imputation: 0

Correlation Matrix¶

I am ignoring the player names because they are not relevant to the correlations. To avoid an unreadable, huge heatmap, I am going to focus on pairs with high correlation (absolute value above 0.95).

In [7]:
# Exclude the 'MLBAMID' column and select only numeric columns for the correlation matrix
numeric_columns = data_cleaned.drop(columns=['MLBAMID']).select_dtypes(include=['float64', 'int64'])

# Calculate the correlation matrix
correlation_matrix = numeric_columns.corr()

# Set a threshold for high correlation
threshold_2 = 0.95

# Select correlations above the threshold or below the negative threshold
high_corr = correlation_matrix[(correlation_matrix >= threshold_2) | (correlation_matrix <= -threshold_2)]

# Plot the heatmap of high correlations only
plt.figure(figsize=(10, 8))
sns.heatmap(high_corr, cmap='coolwarm', mask=high_corr.isnull())  # Mask NaN values
plt.title('Heatmap of High Correlations (|correlation| > 0.95)')
plt.show()

I know this output is not very readable; the fact that it remains dense even after applying a 0.95 threshold shows how strongly correlated the variables are and why this is a great dataset for this project. I will use a clustermap to make the structure a bit clearer.

Cluster Map¶

In [8]:
# Cluster the variables (clustermap builds its own figure, so the title goes on that figure)
cluster_grid = sns.clustermap(correlation_matrix, cmap='coolwarm', annot=False, figsize=(10, 8))
cluster_grid.fig.suptitle('Clustered Heatmap of Correlations')
plt.show()

The clustered heatmap above shows the correlation structure of various baseball statistics, with variables grouped based on their similarity. Dark red squares indicate strong positive correlations (close to 1), while dark blue squares represent strong negative correlations (close to -1). We can see that certain offensive metrics, such as home runs (HR), wRC+, and RBI, are tightly clustered, indicating they are highly correlated with one another. This suggests that these variables might exhibit multicollinearity, which could affect a standard regression model. By identifying these clusters, we can either address multicollinearity through penalized regression techniques like Ridge and Lasso, or potentially combine or drop some of the redundant variables to simplify the model.

VIF¶

In [9]:
# Drop 'MLBAMID' and 'Name' columns first
data_cleaned_no_id = data_cleaned.drop(columns=['MLBAMID', 'Name'])

# Select only numeric columns to ensure there are no strings
data_cleaned_numeric = data_cleaned_no_id.select_dtypes(include=['float64', 'int64'])

# Calculate the correlation matrix for numeric columns only
corr_matrix_full = data_cleaned_numeric.corr().abs()

# Select upper triangle of correlation matrix
upper_full = corr_matrix_full.where(np.triu(np.ones(corr_matrix_full.shape), k=1).astype(bool))

# Find features with correlation greater than 0.9
to_drop_full = [column for column in upper_full.columns if any(upper_full[column] > 0.9)]

# Drop these columns from the full dataset before calculating VIF
data_cleaned_reduced = data_cleaned_numeric.drop(columns=to_drop_full)

# Helper: compute the variance inflation factor (VIF) for each column
def calculate_vif(df):
    vif = pd.DataFrame()
    vif['Variable'] = df.columns
    vif['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif

# Calculate VIF on the reduced cleaned dataset
vif_data_reduced = calculate_vif(data_cleaned_reduced)

# Display the reduced VIF values
print(vif_data_reduced)
     Variable           VIF
0           G  2.068593e+02
1          HR  3.800921e+01
2          SB  1.654689e+01
3         BB%  7.572503e+02
4          K%  7.572423e+01
5         ISO  5.505771e+01
6       BABIP  3.398217e+02
7         AVG  8.792624e+03
8         OBP  1.571361e+04
9         BsR  4.415162e+00
10        Off  1.169821e+02
11        Def           inf
12        WAR  3.554028e+02
13     Season  6.727782e+02
14        Age  1.658026e+02
15         3B  1.476968e+01
16         BB  4.995195e+01
17         SO  4.014212e+01
18        HBP  8.130748e+00
19         SH  4.244382e+00
20         CS  6.500553e+00
21       BB/K  1.000479e+01
22        Fld           inf
23        Pos           inf
24        Spd  3.193048e+01
25        wSB  7.014029e+00
26         Lg  1.481648e+01
27       TTO%  8.607953e+01
28       BB%+  6.922229e+01
29        K%+  2.325885e+01
30     Events  2.596641e+00
31  wOppTeamV  2.251800e+15
32     wTeamV  7.505999e+14
33   wNetBatV  9.007199e+14
34         TG  4.180905e+01
35   PlayerId  9.417390e+00
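
Even after dropping one of every pair with correlation above 0.9, several variables still show infinite or astronomically large VIFs: Def, Fld, and Pos are infinite (Def is, in FanGraphs' accounting, essentially Fld plus the positional adjustment), and the team-value columns reach values on the order of 10^15. Simple correlation filtering clearly does not eliminate the multicollinearity, which is precisely the situation Lasso and Ridge are designed to handle.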

Modeling¶

Preparing for modeling with Train test¶

In [10]:
# Drop any columns related to 'wRC+' (including variations like 'wRC+.1', 'wRC+.2', etc.)
data_cleaned_no_id = data_cleaned.drop(columns=[col for col in data_cleaned.columns if 'wRC+' in col])

# Ensure only numeric columns are used for X
X = data_cleaned_no_id.select_dtypes(include=['float64', 'int64'])  # Keep only numeric columns
y = data_cleaned['wRC+']  # Target variable remains the same

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
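
One caveat worth flagging: Lasso and Ridge penalize coefficients by their magnitude, so features on larger raw scales are effectively penalized less. The models below are fit on the raw features; as a hedged sketch, the already-imported StandardScaler could be slotted in like this (fit on the training split only, to avoid leakage):

In [ ]:
# Optional refinement (not applied in the fits below): standardize the
# features so the penalty treats every coefficient on an equal footing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn mean/std from training data only
X_test_scaled = scaler.transform(X_test)        # reuse those statistics on the test set
# X_train_scaled / X_test_scaled could then replace X_train / X_test below.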

Lasso¶

We will set alpha to 0.1.

In [11]:
# Initialize the Lasso model with alpha=0.1
lasso = Lasso(alpha=0.1)

# Fit the model on the training data
lasso.fit(X_train, y_train)

# Make predictions on the test data
y_pred_lasso = lasso.predict(X_test)

# Evaluate the model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Display the results
print(f"Lasso (alpha=0.1) - Mean Squared Error: {round(mse_lasso, 4)}, R² Score: {round(r2_lasso, 4)}")
Lasso (alpha=0.1) - Mean Squared Error: 7.5003, R² Score: 0.987

I chose an alpha value of 0.1 because it provides a stronger regularization effect, which helps to balance model complexity and performance. A higher alpha shrinks more coefficients and eliminates less important features, reducing overfitting and improving generalization to unseen data. A smaller alpha such as 0.05 might score slightly better on the test set, but it risks retaining too many features, which can lead to overfitting in the presence of multicollinearity. By opting for a moderate alpha value of 0.1, I allow Lasso to perform effective feature selection while maintaining a high degree of predictive accuracy, as evidenced by the R² score of 0.987 and the realistic mean squared error (MSE) of 7.5003.

Mean Squared Error (MSE): 7.5003 corresponds to a typical prediction error of roughly 2.7 wRC+ points (the square root of the MSE), which is small for a statistic centered at 100. A small, non-zero error is a healthier sign than an unrealistic result like an MSE of 0.0, which would suggest overfitting.

R² Score: 0.987 indicates that the model explains about 98.7% of the variance in the target variable (wRC+), which is very high. This suggests that the model generalizes well to the test data while leaving room for a small degree of unexplained variance, indicating a balance between accuracy and regularization.
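
Rather than fixing alpha by hand, the LassoCV import above could choose it by cross-validation. A minimal sketch (the alpha grid and fold count here are illustrative assumptions, not tuned values):

In [ ]:
# Sketch: let 5-fold cross-validation pick alpha instead of fixing it at 0.1.
# The alpha grid is an illustrative choice, not tuned for this dataset.
lasso_cv = LassoCV(alphas=np.logspace(-3, 1, 50), cv=5, random_state=42)
lasso_cv.fit(X_train, y_train)
print(f"CV-selected alpha: {lasso_cv.alpha_:.4f}")
print(f"Test R²: {r2_score(y_test, lasso_cv.predict(X_test)):.4f}")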

Top Features¶

In [12]:
# Get the coefficients from the Lasso model
coef = pd.Series(lasso.coef_, index=X_train.columns)

# Filter out features with zero coefficients (which Lasso has eliminated)
non_zero_coef = coef[coef != 0]

# Sort the non-zero coefficients by their absolute values in descending order
top_features = non_zero_coef.abs().sort_values(ascending=False).head(15)

# Display the top features
print("Top Features Selected by Lasso (alpha=0.1):")
print(top_features)
Top Features Selected by Lasso (alpha=0.1):
OBP+      0.725933
SLG+.1    0.268764
SLG+.2    0.241695
OBP+.1    0.239472
SLG+      0.219312
SLG+.3    0.210515
SLG+.4    0.184540
AVG+.4    0.130021
WAR.3     0.116953
WAR.2     0.116650
WAR.4     0.112014
AVG+.3    0.108920
WAR.5     0.107852
OBP+.2    0.092020
WAR.1     0.074377
dtype: float64

Analysis

The results from the Lasso regression (with alpha = 0.1) highlight some interesting insights into which stats are most predictive of a player's wRC+. Notably, the top features selected by the model are modern sabermetric metrics such as SLG+ and OBP+, rather than more traditional baseball statistics. (The .1, .2, etc. suffixes are simply pandas' automatic renaming of duplicate column names in the FanGraphs export; they refer to the same underlying stats.)

This is a strong indication that the evolution of sabermetric thinking—focusing on advanced metrics that account for various contextual factors—is crucial for understanding player performance in today’s game. The prominence of stats like SLG+ and OBP+ demonstrates that these advanced measurements, which better capture a player’s overall offensive contributions, are far more predictive of success than simpler, more conventional stats like batting average or raw hit totals.

These results emphasize how baseball has evolved in terms of player evaluation. As the game has grown more analytical, modern sabermetric statistics have proven to be more reliable in explaining and predicting player success, validating the shift towards using advanced metrics in the modern game. This reinforces the idea that the future of baseball analysis lies in building and understanding these nuanced, context-adjusted stats.

Ridge¶

In [13]:
# Initialize Ridge regression with an alpha value
ridge = Ridge(alpha=0.1)

# Fit the model on the training data
ridge.fit(X_train, y_train)

# Make predictions on the test data
y_pred_ridge = ridge.predict(X_test)

# Evaluate the model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Display the results
print(f"Ridge (alpha=0.1) - Mean Squared Error: {round(mse_ridge, 4)}, R² Score: {round(r2_ridge, 4)}")
Ridge (alpha=0.1) - Mean Squared Error: 6.0645, R² Score: 0.9895

Analysis:

  • Mean Squared Error (MSE): 6.0645 corresponds to a typical prediction error of about 2.5 wRC+ points on a statistic centered at 100, which means the model makes only small errors when predicting the target variable (wRC+) on the test set. This indicates that the model is performing very well and accurately captures the relationship between the input features and the target.

  • R² Score: 0.9895 means that the Ridge regression model explains 98.95% of the variance in the target variable (wRC+). This is an excellent score, suggesting that the model has very strong predictive ability on the test data.
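
As the robustness check described in Question 7, the two models' coefficients can be compared directly; broad agreement on the top-ranked features would reinforce the conclusions above. A minimal sketch using the already-fitted models:

In [ ]:
# Robustness check sketch: compare where Lasso and Ridge place their weight.
comparison = pd.DataFrame({
    'Lasso': pd.Series(lasso.coef_, index=X_train.columns),
    'Ridge': pd.Series(ridge.coef_, index=X_train.columns),
})
# Rank by the magnitude of the Lasso coefficient and show the Ridge counterpart
top10 = comparison['Lasso'].abs().sort_values(ascending=False).head(10).index
print(comparison.loc[top10])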

In [ ]:
!cp "/content/drive/MyDrive/Colab Notebooks/silverstein_penalized_regression.ipynb" ./
!jupyter nbconvert --to html "silverstein_penalized_regression.ipynb"