%matplotlib inline

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import skew
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=Warning)

# Load the datasets
# NOTE(review): the paths are hard-coded to the Colab /content directory; adjust them
# when running the notebook elsewhere.
batting_df = pd.read_csv('/content/statcast_batting.csv')
pitching_df = pd.read_csv('/content/pitching_statcast.csv')
contracts_df = pd.read_csv('/content/contracts.csv')


def split_statcast_name(statcast_df):
    """Return a copy of `statcast_df` with the combined name column split in two.

    The 'last_name, first_name' column is split on the first ', ' only (``n=1``), so a
    value containing a further ', ' still yields exactly two parts instead of raising
    when the result is assigned to two columns. The combined column is removed and
    'last_name' / 'first_name' are appended as the last two columns.

    Parameters
    ----------
    statcast_df : pandas.DataFrame
        Frame that contains a 'last_name, first_name' column.

    Returns
    -------
    pandas.DataFrame
        New frame; `statcast_df` itself is not modified.
    """
    name_parts = statcast_df['last_name, first_name'].str.split(', ', n=1, expand=True)
    return (
        statcast_df
        .drop(columns=['last_name, first_name'])
        .assign(last_name=name_parts[0], first_name=name_parts[1])
    )


# Split "last_name, first_name" in the batting and pitching Statcast data
batting_df = split_statcast_name(batting_df)
pitching_df = split_statcast_name(pitching_df)

# Collapse every pitcher sub-role in the contracts table into one "pitcher" label
pitching_positions = ['rhp-s', 'lhp-s', 'rhp-c', 'lhp-c', 'rhp', 'lhp']
contracts_df['position'] = contracts_df['position'].replace(
    {role: 'pitcher' for role in pitching_positions}
)

# Partition the contracts by the normalised position
is_pitcher = contracts_df['position'] == 'pitcher'
pitching_contracts = contracts_df[is_pitcher]
batting_contracts = contracts_df[~is_pitcher]

# Inner-join each contract subset to its Statcast table on player name and season
merge_keys = ['first_name', 'last_name', 'year']
merged_pitching = pitching_contracts.merge(pitching_df, on=merge_keys, how='inner')
merged_batting = batting_contracts.merge(batting_df, on=merge_keys, how='inner')

# Preview the first rows of both merged frames
for heading, preview_frame in (
    ("Merged Pitching Dataset:", merged_pitching),
    ("\nMerged Batting Dataset:", merged_batting),
):
    print(heading)
    print(preview_frame.head())

Merged Pitching Dataset:
   Unnamed: 0 first_name last_name       team  year position     age  \
0           7    Framber    Valdez    Houston  2024  pitcher      30   
1          22     Hunter     Brown    Houston  2024  pitcher      25   
2          28      Ronel    Blanco    Houston  2024  pitcher      30   
3          37      Shane    Bieber  Cleveland  2023  pitcher  28.031   
4          44      Aaron    Civale  Cleveland  2023  pitcher  28.019   

   service time           agent    value  ...  n_ff_formatted  ff_avg_speed  \
0         4.163         Octagon  12.1000  ...             1.2          94.5   
1         1.035             NaN   0.7745  ...            34.7          96.0   
2         0.101             NaN   0.7498  ...            38.3          93.4   
3         4.097  Rosenhaus Spts  10.0100  ...            35.2          91.3   
4         3.058             NaN   2.6000  ...            12.3          91.8   

   ff_avg_spin  n_sl_formatted  sl_avg_speed  n_ch_formatted  ch_avg_speed  \
0       2152.0             4.3          85.1            17.5          89.9   
1       2297.0             5.2          89.2            12.6          88.4   
2       2228.0            30.2          86.4            22.1          85.2   
3       2242.0            20.7          84.6             3.5          87.2   
4       2384.0             5.8          82.5             NaN           NaN   

   n_cu_formatted  cu_avg_speed  cu_avg_spin  
0            31.3          79.7       2905.0  
1            12.5          82.9       2486.0  
2             9.4          80.1       2258.0  
3            13.7          82.5       2239.0  
4            24.4          78.1       2985.0  

[5 rows x 55 columns]

Merged Batting Dataset:
   Unnamed: 0 first_name last_name     team  year position age  service time  \
0           1       Alex   Bregman  Houston  2024       3b  30         7.070   
1           2       Jose    Altuve  Houston  2024       2b  34        12.072   
2          10     Yordan   Alvarez  Houston  2024    lf-dh  27         4.113   
3          21     Jeremy      Peña  Houston  2024       ss  26         2.000   
4          23     Yainer      Diaz  Houston  2024        c  25         1.035   

         agent      value  ...  hard_hit_percent  avg_best_speed  \
0  Boras Corp.  30.500000  ...              40.5       98.986528   
1  Boras Corp.  29.200000  ...              31.2       97.323673   
2   MVP Sports  10.833333  ...              49.7      104.097105   
3          NaN   0.783500  ...              38.8       99.323507   
4          NaN   0.768900  ...              47.5      101.297056   

   avg_hyper_speed  z_swing_percent  z_swing_miss_percent  oz_swing_percent  \
0        93.684161             65.6                   8.7              23.6   
1        92.693763             68.6                  12.6              37.3   
2        96.703520             68.0                  11.1              30.5   
3        93.741684             72.9                  11.4              36.9   
4        95.034360             77.7                  13.6              42.6   

   oz_swing_miss_percent  oz_contact_percent  whiff_percent  swing_percent  
0                   24.7                75.3           12.8           44.9  
1                   36.1                62.7           21.9           51.5  
2                   36.5                63.5           19.9           47.8  
3                   49.4                49.7           24.9           54.2  
4                   40.2                59.8           24.0           58.8  

[5 rows x 48 columns]

# The agent column is discarded from both merged frames
merged_pitching = merged_pitching.drop(columns='agent')
merged_batting = merged_batting.drop(columns='agent')


def _missing_summary(frame):
    """Return (null counts, null percentages) for the columns of `frame` that have nulls."""
    null_counts = frame.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    return null_counts, (null_counts / len(frame)) * 100


# Per-column null counts and their share of all rows, restricted to affected columns
missing_values_pitching, missing_percentage_pitching = _missing_summary(merged_pitching)
missing_values_batting, missing_percentage_batting = _missing_summary(merged_batting)

# Report the affected columns for each dataset
print("Missing values in Merged Pitching Dataset:")
print(pd.DataFrame({
    'Missing Values': missing_values_pitching,
    'Percentage': missing_percentage_pitching,
}))

print("\nMissing values in Merged Batting Dataset:")
print(pd.DataFrame({
    'Missing Values': missing_values_batting,
    'Percentage': missing_percentage_batting,
}))

Missing values in Merged Pitching Dataset:
                Missing Values  Percentage
n_ff_formatted              16    3.539823
ff_avg_speed                16    3.539823
ff_avg_spin                 16    3.539823
n_sl_formatted             102   22.566372
sl_avg_speed               102   22.566372
n_ch_formatted              64   14.159292
ch_avg_speed                64   14.159292
n_cu_formatted              70   15.486726
cu_avg_speed                70   15.486726
cu_avg_spin                 70   15.486726

Missing values in Merged Batting Dataset:
                    Missing Values  Percentage
avg_swing_speed                459   78.865979
fast_swing_rate                459   78.865979
blasts_contact                 459   78.865979
blasts_swing                   459   78.865979
squared_up_contact             459   78.865979

# Columns whose missing percentage exceeds this value are removed
threshold = 75

# Names of the columns that are above the cut-off in each dataset
columns_to_remove_pitching = missing_percentage_pitching.loc[lambda pct: pct > threshold].index
columns_to_remove_batting = missing_percentage_batting.loc[lambda pct: pct > threshold].index

# Remove them
merged_pitching = merged_pitching.drop(columns=columns_to_remove_pitching)
merged_batting = merged_batting.drop(columns=columns_to_remove_batting)

# List the surviving columns to confirm the removal
for heading, remaining_frame in (
    ("Columns in Merged Pitching Dataset after removal:", merged_pitching),
    ("\nColumns in Merged Batting Dataset after removal:", merged_batting),
):
    print(heading)
    print(remaining_frame.columns)

Columns in Merged Pitching Dataset after removal:
Index(['Unnamed: 0', 'first_name', 'last_name', 'team', 'year', 'position',
       'age', 'service time', 'value', 'player_id', 'pa', 'home_run',
       'strikeout', 'k_percent', 'bb_percent', 'batting_avg', 'slg_percent',
       'p_era', 'xwoba', 'sweet_spot_percent', 'barrel', 'barrel_batted_rate',
       'solidcontact_percent', 'hard_hit_percent', 'avg_hyper_speed',
       'z_swing_percent', 'z_swing_miss_percent', 'oz_swing_percent',
       'oz_swing_miss_percent', 'oz_contact_percent', 'out_zone_swing_miss',
       'meatball_swing_percent', 'meatball_percent', 'pitch_count_offspeed',
       'pitch_count_fastball', 'pitch_count_breaking', 'pitch_count',
       'iz_contact_percent', 'in_zone_swing_miss', 'whiff_percent',
       'swing_percent', 'pitch_hand', 'n', 'arm_angle', 'n_ff_formatted',
       'ff_avg_speed', 'ff_avg_spin', 'n_sl_formatted', 'sl_avg_speed',
       'n_ch_formatted', 'ch_avg_speed', 'n_cu_formatted', 'cu_avg_speed',
       'cu_avg_spin'],
      dtype='object')

Columns in Merged Batting Dataset after removal:
Index(['Unnamed: 0', 'first_name', 'last_name', 'team', 'year', 'position',
       'age', 'service time', 'value', 'player_id', 'pa', 'hit', 'home_run',
       'walk', 'k_percent', 'bb_percent', 'batting_avg', 'slg_percent',
       'on_base_percent', 'on_base_plus_slg', 'isolated_power', 'babip',
       'b_rbi', 'woba', 'xwoba', 'wobacon', 'xwobacon', 'xbacon',
       'exit_velocity_avg', 'sweet_spot_percent', 'barrel_batted_rate',
       'solidcontact_percent', 'hard_hit_percent', 'avg_best_speed',
       'avg_hyper_speed', 'z_swing_percent', 'z_swing_miss_percent',
       'oz_swing_percent', 'oz_swing_miss_percent', 'oz_contact_percent',
       'whiff_percent', 'swing_percent'],
      dtype='object')

# "Unnamed: 0" is removed from both frames; errors='ignore' makes this a no-op when absent
index_artifact = 'Unnamed: 0'
merged_pitching = merged_pitching.drop(columns=index_artifact, errors='ignore')
merged_batting = merged_batting.drop(columns=index_artifact, errors='ignore')

# Preview both frames to confirm the column is gone
for heading, preview_frame in (
    ("Merged Pitching Dataset after removing 'Unnamed: 0':", merged_pitching),
    ("\nMerged Batting Dataset after removing 'Unnamed: 0':", merged_batting),
):
    print(heading)
    print(preview_frame.head())

Merged Pitching Dataset after removing 'Unnamed: 0':
  first_name last_name       team  year position     age  service time  \
0    Framber    Valdez    Houston  2024  pitcher      30         4.163   
1     Hunter     Brown    Houston  2024  pitcher      25         1.035   
2      Ronel    Blanco    Houston  2024  pitcher      30         0.101   
3      Shane    Bieber  Cleveland  2023  pitcher  28.031         4.097   
4      Aaron    Civale  Cleveland  2023  pitcher  28.019         3.058   

     value  player_id   pa  ...  n_ff_formatted  ff_avg_speed  ff_avg_spin  \
0  12.1000     664285  703  ...             1.2          94.5       2152.0   
1   0.7745     686613  712  ...            34.7          96.0       2297.0   
2   0.7498     669854  676  ...            38.3          93.4       2228.0   
3  10.0100     669456  533  ...            35.2          91.3       2242.0   
4   2.6000     650644  504  ...            12.3          91.8       2384.0   

   n_sl_formatted  sl_avg_speed  n_ch_formatted  ch_avg_speed  n_cu_formatted  \
0             4.3          85.1            17.5          89.9            31.3   
1             5.2          89.2            12.6          88.4            12.5   
2            30.2          86.4            22.1          85.2             9.4   
3            20.7          84.6             3.5          87.2            13.7   
4             5.8          82.5             NaN           NaN            24.4   

   cu_avg_speed  cu_avg_spin  
0          79.7       2905.0  
1          82.9       2486.0  
2          80.1       2258.0  
3          82.5       2239.0  
4          78.1       2985.0  

[5 rows x 53 columns]

Merged Batting Dataset after removing 'Unnamed: 0':
  first_name last_name     team  year position age  service time      value  \
0       Alex   Bregman  Houston  2024       3b  30         7.070  30.500000   
1       Jose    Altuve  Houston  2024       2b  34        12.072  29.200000   
2     Yordan   Alvarez  Houston  2024    lf-dh  27         4.113  10.833333   
3     Jeremy      Peña  Houston  2024       ss  26         2.000   0.783500   
4     Yainer      Diaz  Houston  2024        c  25         1.035   0.768900   

   player_id   pa  ...  hard_hit_percent  avg_best_speed  avg_hyper_speed  \
0     608324  634  ...              40.5       98.986528        93.684161   
1     514888  682  ...              31.2       97.323673        92.693763   
2     670541  635  ...              49.7      104.097105        96.703520   
3     665161  650  ...              38.8       99.323507        93.741684   
4     673237  619  ...              47.5      101.297056        95.034360   

   z_swing_percent  z_swing_miss_percent  oz_swing_percent  \
0             65.6                   8.7              23.6   
1             68.6                  12.6              37.3   
2             68.0                  11.1              30.5   
3             72.9                  11.4              36.9   
4             77.7                  13.6              42.6   

   oz_swing_miss_percent  oz_contact_percent  whiff_percent  swing_percent  
0                   24.7                75.3           12.8           44.9  
1                   36.1                62.7           21.9           51.5  
2                   36.5                63.5           19.9           47.8  
3                   49.4                49.7           24.9           54.2  
4                   40.2                59.8           24.0           58.8  

[5 rows x 41 columns]

def impute_missing_data(df):
    """Fill missing values in the numeric columns of `df`, choosing the statistic by skewness.

    For every numeric column that contains at least one null, the skewness of the
    non-null values is computed with ``scipy.stats.skew``. Columns with skewness above
    0.5 (right-skewed) are filled with their median; all other columns are filled with
    their mean. Non-numeric columns are skipped, because neither statistic (nor the
    skewness) is defined for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Filled columns are assigned back onto `df` itself.

    Returns
    -------
    None
        The frame is modified through column assignment; nothing is returned.
    """
    for column in df.columns:
        values = df[column]
        if not values.isnull().any():
            continue  # nothing to fill
        if not pd.api.types.is_numeric_dtype(values):
            continue  # mean/median/skew are undefined for text columns

        column_skewness = skew(values.dropna())
        # Right-skewed -> median; fairly symmetric or left-skewed -> mean
        fill_value = values.median() if column_skewness > 0.5 else values.mean()

        # Assign the result back instead of calling fillna(inplace=True) on df[column]:
        # the in-place form acts on an intermediate object and leaves df unchanged
        # when pandas Copy-on-Write is active.
        df[column] = values.fillna(fill_value)

# Impute both merged frames
for frame_to_impute in (merged_pitching, merged_batting):
    impute_missing_data(frame_to_impute)

# Total number of nulls left in each frame
missing_after_imputation_pitching = merged_pitching.isnull().sum().sum()
missing_after_imputation_batting = merged_batting.isnull().sum().sum()

print(f"Remaining missing values in Merged Pitching Dataset: {missing_after_imputation_pitching}")
print(f"Remaining missing values in Merged Batting Dataset: {missing_after_imputation_batting}")

Remaining missing values in Merged Pitching Dataset: 0
Remaining missing values in Merged Batting Dataset: 0

# Histogram (with KDE) of the salary column in the hitting dataset
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(merged_batting['value'], kde=True, ax=ax)
ax.set_title("Distribution of Salary in Hitting Dataset")
ax.set_xlabel("Salary (Value)")
ax.set_ylabel("Frequency")
plt.show()

# Report how many salaries are missing before deriving the label
print("Missing values in 'value' column before creating 'top_earner':", merged_batting['value'].isnull().sum())

# The 75th percentile salary is the cut-off for a top earner
top_25_salary = merged_batting['value'].quantile(0.75)

# Binary label: 1 when the salary is at or above the cut-off, else 0.
# `>=` means a salary exactly equal to the 75th percentile counts as a top earner.
merged_batting['top_earner'] = merged_batting['value'].ge(top_25_salary).astype(int)

# Report nulls in the new label column
print("Missing values in 'top_earner' after creation:", merged_batting['top_earner'].isnull().sum())

Missing values in 'value' column before creating 'top_earner': 0
Missing values in 'top_earner' after creation: 0

# Histogram (with KDE) of the salary column in the pitching dataset
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(merged_pitching['value'], kde=True, ax=ax)
ax.set_title("Distribution of Salary in Pitching Dataset")
ax.set_xlabel("Salary (Value)")
ax.set_ylabel("Frequency")
plt.show()

# Report how many salaries are missing before deriving the label
print("Missing values in 'value' column before creating 'top_earner':", merged_pitching['value'].isnull().sum())

# The 75th percentile salary is the cut-off for a top earner
top_25_salary = merged_pitching['value'].quantile(0.75)

# Binary label: 1 when the salary is at or above the cut-off, else 0.
# `>=` means a salary exactly equal to the 75th percentile counts as a top earner.
merged_pitching['top_earner'] = merged_pitching['value'].ge(top_25_salary).astype(int)

# Report nulls in the new label column
print("Missing values in 'top_earner' after creation:", merged_pitching['top_earner'].isnull().sum())

Missing values in 'value' column before creating 'top_earner': 0
Missing values in 'top_earner' after creation: 0

# Identifier / contract-context columns are excluded from both feature sets.
# errors='ignore' skips any listed column that a frame does not contain.
non_informative_cols = ['first_name', 'last_name', 'team', 'position', 'year', 'age','player_id','service time', 'cu_avg_speed']
merged_batting = merged_batting.drop(columns=non_informative_cols, errors='ignore')
merged_pitching = merged_pitching.drop(columns=non_informative_cols, errors='ignore')

# Keep only labelled rows, then separate the label from the features.
# 'value' is excluded from the features because the label is derived from it.
merged_batting = merged_batting.dropna(subset=['top_earner'])
y_batting = merged_batting['top_earner']
X_batting = merged_batting.drop(columns=['top_earner', 'value'])

# Null counts in the model inputs
print("Missing values in X_batting:", X_batting.isnull().sum().sum())
print("Missing values in y_batting:", y_batting.isnull().sum())

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_batting, y_batting, test_size=0.2, random_state=42
)

# Fit the Random Forest on the training split
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Hard predictions and positive-class probabilities for the test split
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Test-set metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print("Random Forest Batting Model Performance:")
for metric_name, metric_value in (("Accuracy", accuracy), ("ROC AUC Score", roc_auc), ("F1 Score", f1)):
    print(f"{metric_name}: {metric_value:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Importance of each feature according to the fitted forest, largest first
feature_importances = rf_model.feature_importances_
features = X_batting.columns
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Bar chart of the ten most important features
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
ax.set_title("Top 10 Feature Importances from Random Forest")
plt.show()

Missing values in X_batting: 0
Missing values in y_batting: 0
Random Forest Batting Model Performance:
Accuracy: 0.75
ROC AUC Score: 0.72
F1 Score: 0.29

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.93      0.85        88
           1       0.50      0.21      0.29        29

    accuracy                           0.75       117
   macro avg       0.64      0.57      0.57       117
weighted avg       0.71      0.75      0.71       117

# Importance cut-offs to compare
thresholds = [0.005, 0.01, 0.02]
performance_metrics = []

# 80/20 split of the batting data (same seed as the Random Forest step)
y_batting = merged_batting['top_earner']
X_batting = merged_batting.drop(columns=['top_earner', 'value'])
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_batting, y_batting, test_size=0.2, random_state=42
)

for threshold in thresholds:
    # Features whose importance in importance_df reaches the current cut-off
    keep_mask = importance_df['Importance'] >= threshold
    important_features = importance_df.loc[keep_mask, 'Feature'].tolist()
    X_train_selected = X_train_full[important_features]

    # Mean 5-fold ROC AUC of Gradient Boosting on the training split with those features
    gb_model = GradientBoostingClassifier(random_state=42)
    avg_cv_score = np.mean(
        cross_val_score(gb_model, X_train_selected, y_train_full, cv=5, scoring='roc_auc')
    )

    # Fit on the whole training split with the selected features
    gb_model.fit(X_train_selected, y_train_full)

    # Record the result for this cut-off
    threshold_result = {
        'Threshold': threshold,
        'Features Used': len(important_features),
        'Average CV ROC AUC': avg_cv_score,
    }
    performance_metrics.append(threshold_result)

# Tabulate the cross-validation results
performance_df = pd.DataFrame(performance_metrics)
print("Performance on Training Data (using Cross-Validation):")
print(performance_df)

# Pick the cut-off with the highest mean CV ROC AUC and its feature subset
best_row_label = performance_df['Average CV ROC AUC'].idxmax()
best_threshold = performance_df.loc[best_row_label, 'Threshold']
best_features = importance_df.loc[importance_df['Importance'] >= best_threshold, 'Feature'].tolist()
print(f"\nBest threshold: {best_threshold} with features: {best_features}")

# Restrict both splits to the winning feature subset
X_train_best = X_train_full[best_features]
X_test_best = X_test[best_features]

# Final Gradient Boosting model trained on the full training split
gb_final_model = GradientBoostingClassifier(random_state=42)
gb_final_model.fit(X_train_best, y_train_full)

# Hard predictions and positive-class probabilities for the test split
y_pred = gb_final_model.predict(X_test_best)
y_pred_proba = gb_final_model.predict_proba(X_test_best)[:, 1]

# Test-set metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print("\nFinal Gradient Boosting Model Performance on Test Set:")
for metric_name, metric_value in (("Accuracy", accuracy), ("ROC AUC Score", roc_auc), ("F1 Score", f1)):
    print(f"{metric_name}: {metric_value:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Performance on Training Data (using Cross-Validation):
   Threshold  Features Used  Average CV ROC AUC
0      0.005             32            0.691874
1      0.010             32            0.691874
2      0.020             32            0.691874

Best threshold: 0.005 with features: ['xwoba', 'b_rbi', 'hard_hit_percent', 'pa', 'bb_percent', 'on_base_plus_slg', 'exit_velocity_avg', 'solidcontact_percent', 'avg_hyper_speed', 'woba', 'xwobacon', 'k_percent', 'on_base_percent', 'avg_best_speed', 'oz_swing_percent', 'walk', 'swing_percent', 'hit', 'babip', 'isolated_power', 'oz_swing_miss_percent', 'z_swing_percent', 'wobacon', 'batting_avg', 'barrel_batted_rate', 'oz_contact_percent', 'slg_percent', 'whiff_percent', 'z_swing_miss_percent', 'sweet_spot_percent', 'xbacon', 'home_run']

Final Gradient Boosting Model Performance on Test Set:
Accuracy: 0.78
ROC AUC Score: 0.70
F1 Score: 0.41

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.93      0.86        88
           1       0.60      0.31      0.41        29

    accuracy                           0.78       117
   macro avg       0.70      0.62      0.64       117
weighted avg       0.75      0.78      0.75       117

# Importances of the features the final Gradient Boosting model was trained on
feature_importances = gb_final_model.feature_importances_
features = X_train_best.columns

# Feature/importance table, most important first
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 1. Bar Plot of Top 10 Feature Importances
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
plt.title("Top 10 Feature Importances from Gradient Boosting")
plt.show()

# 2. Cumulative Feature Importance Plot
importance_df['Cumulative Importance'] = importance_df['Importance'].cumsum()

# Plot against an explicit 1-based feature count. Passing the Series alone would use its
# index as x, and after sort_values that index is the shuffled original row order, so the
# curve would not be drawn in order of "Number of Features".
feature_rank = np.arange(1, len(importance_df) + 1)

plt.figure(figsize=(10, 6))
plt.plot(feature_rank, importance_df['Cumulative Importance'].to_numpy(), marker='o')
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.title("Cumulative Feature Importance")
plt.axhline(y=0.9, color='r', linestyle='--', label='90% Cumulative Importance')
plt.legend()
plt.show()

def evaluate_xgboost_with_smote(X, y, importance_df, thresholds=(0.005, 0.01, 0.02)):
    """Pick a feature subset by importance threshold, then train and evaluate SMOTE + XGBoost.

    The data is split 80/20 with ``random_state=42``. For each threshold, the features
    whose 'Importance' in `importance_df` is at least the threshold (and that are columns
    of `X`) are scored by mean 5-fold cross-validated ROC AUC on the training split.
    SMOTE is the first step of an imbalanced-learn pipeline, so inside cross-validation
    it is fit on each training fold only; validation folds contain no synthetic samples.
    The threshold with the highest mean CV score is used to fit a final pipeline on the
    whole training split, which is then evaluated on the untouched test split. The CV
    table, the chosen features and the test metrics are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary target using the labels 0 and 1.
    importance_df : pandas.DataFrame
        Table with 'Feature' and 'Importance' columns used for feature selection.
    thresholds : iterable of float, optional
        Importance cut-offs to compare.

    Returns
    -------
    tuple
        ``(xgb_final_model, X_train_best)``: the fitted XGBClassifier from the final
        pipeline and the training-split frame restricted to the selected features.

    Raises
    ------
    ValueError
        If none of the thresholds selects at least one feature.
    """
    # Split data into training and test sets
    X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Negative/positive ratio of the original training labels, computed once.
    # NOTE(review): SMOTE already balances the classes the classifier sees, so this
    # weight biases the model further towards the positive class; confirm that is intended.
    class_counts = y_train_full.value_counts()
    positive_weight = class_counts[0] / class_counts[1]

    def build_pipeline():
        # SMOTE runs only when the pipeline is fitted; prediction skips the sampler.
        return ImbPipeline([
            ('smote', SMOTE(random_state=42)),
            ('xgb', XGBClassifier(random_state=42, scale_pos_weight=positive_weight)),
        ])

    def features_at(threshold):
        # Features at or above the cut-off that actually exist in X
        selected = importance_df.loc[importance_df['Importance'] >= threshold, 'Feature']
        return selected[selected.isin(X_train_full.columns)].tolist()

    performance_metrics = []
    for threshold in thresholds:
        important_features = features_at(threshold)
        if not important_features:
            # A model cannot be fitted on zero columns; skip this cut-off
            print(f"Threshold {threshold} selects no features; skipped.")
            continue

        # Cross-validate on the real training data; resampling happens inside each fold
        cv_scores = cross_val_score(
            build_pipeline(),
            X_train_full[important_features],
            y_train_full,
            cv=5,
            scoring='roc_auc',
        )

        performance_metrics.append({
            'Threshold': threshold,
            'Features Used': len(important_features),
            'Average CV ROC AUC': np.mean(cv_scores)
        })

    if not performance_metrics:
        raise ValueError("No importance threshold selected any feature present in X.")

    # Convert to DataFrame for easy viewing
    performance_df = pd.DataFrame(performance_metrics)
    print("Performance on Training Data (using Cross-Validation):")
    print(performance_df)

    # Choose the threshold that provided the best cross-validation performance
    best_threshold = performance_df.loc[performance_df['Average CV ROC AUC'].idxmax(), 'Threshold']
    best_features = features_at(best_threshold)
    print(f"\nBest threshold: {best_threshold} with features: {best_features}")

    # Final model: SMOTE + XGBoost fitted on the whole training split, best features only
    X_train_best = X_train_full[best_features]
    X_test_best = X_test[best_features]
    final_pipeline = build_pipeline()
    final_pipeline.fit(X_train_best, y_train_full)
    xgb_final_model = final_pipeline.named_steps['xgb']

    # Predict on the test set
    y_pred = final_pipeline.predict(X_test_best)
    y_pred_proba = final_pipeline.predict_proba(X_test_best)[:, 1]

    # Evaluate the final model on the test set
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)

    print("\nFinal XGBoost Model with SMOTE Performance on Test Set:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"ROC AUC Score: {roc_auc:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return xgb_final_model, X_train_best

# Prepare the batting dataset
X_batting = merged_batting.drop(columns=['top_earner', 'value'])
y_batting = merged_batting['top_earner']

# Run the evaluation and keep the fitted model and its training features at notebook
# scope, where the XGBoost feature-importance plot reads them.
xgb_final_model, X_train_best = evaluate_xgboost_with_smote(X_batting, y_batting, importance_df)

Performance on Training Data (using Cross-Validation):
   Threshold  Features Used  Average CV ROC AUC
0      0.005             13            0.885844
1      0.010             13            0.885844
2      0.020             13            0.885844

Best threshold: 0.005 with features: ['barrel_batted_rate', 'solidcontact_percent', 'k_percent', 'xwoba', 'oz_contact_percent', 'avg_hyper_speed', 'bb_percent', 'sweet_spot_percent', 'swing_percent', 'oz_swing_percent', 'home_run', 'z_swing_percent', 'batting_avg']

Final XGBoost Model with SMOTE Performance on Test Set:
Accuracy: 0.71
ROC AUC Score: 0.74
F1 Score: 0.53

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.73      0.79        88
           1       0.44      0.66      0.53        29

    accuracy                           0.71       117
   macro avg       0.65      0.69      0.66       117
weighted avg       0.76      0.71      0.73       117

# Pair each selected feature with its XGBoost importance, most important first.
# Relies on `xgb_final_model` and `X_train_best` being defined at notebook scope
# by the XGBoost evaluation step.
feature_importances = xgb_final_model.feature_importances_
features = X_train_best.columns
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Bar chart of the ten most important features
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
ax.set_title("Top 10 Feature Importances from XGBoost with SMOTE - Batting")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()

# Keep only rows that have a target label
merged_pitching = merged_pitching.dropna(subset=['top_earner'])

# Separate the label from the features; 'value' is excluded because the label is derived from it
y_pitching = merged_pitching['top_earner']
X_pitching = merged_pitching.drop(columns=['top_earner', 'value'])

# Record which feature columns are non-numeric, then keep only the numeric ones
non_numeric_columns = X_pitching.select_dtypes(exclude=['number']).columns
X_pitching = X_pitching.select_dtypes(include=['number'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_pitching, y_pitching, test_size=0.2, random_state=42
)

# Fit the Random Forest on the training split
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Hard predictions and positive-class probabilities for the test split
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Test-set metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print("Random Forest Pitching Model Performance:")
for metric_name, metric_value in (("Accuracy", accuracy), ("ROC AUC Score", roc_auc), ("F1 Score", f1)):
    print(f"{metric_name}: {metric_value:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Importance of each feature according to the fitted forest, largest first
feature_importances = rf_model.feature_importances_
features = X_pitching.columns
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Bar chart of the ten most important features
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
ax.set_title("Top 10 Feature Importances from Random Forest - Pitching")
plt.show()

Random Forest Pitching Model Performance:
Accuracy: 0.66
ROC AUC Score: 0.60
F1 Score: 0.06

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.98      0.79        60
           1       0.50      0.03      0.06        31

    accuracy                           0.66        91
   macro avg       0.58      0.51      0.43        91
weighted avg       0.61      0.66      0.54        91

# Importance cut-offs to compare when selecting features
thresholds = [0.005, 0.01, 0.02]
performance_metrics = []

# Hold out 20% of the pitching rows as a test set (fixed seed so the split is repeatable)
X_pitching = merged_pitching.drop(columns=['top_earner', 'value'])
y_pitching = merged_pitching['top_earner']
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_pitching, y_pitching, test_size=0.2, random_state=42
)

# Score each cut-off with cross-validation on the training portion only
for threshold in thresholds:
    # Keep the features whose importance (as left in importance_df by the
    # previously executed cell) meets the cut-off
    important_features = importance_df[importance_df['Importance'] >= threshold]['Feature'].tolist()
    X_train_selected = X_train_full[important_features]

    # 5-fold cross-validated ROC AUC. cross_val_score fits clones of the estimator,
    # so fitting gb_model itself inside this loop would be wasted work.
    gb_model = GradientBoostingClassifier(random_state=42)
    cv_scores = cross_val_score(gb_model, X_train_selected, y_train_full, cv=5, scoring='roc_auc')
    avg_cv_score = np.mean(cv_scores)

    # Store performance metrics for each threshold
    performance_metrics.append({
        'Threshold': threshold,
        'Features Used': len(important_features),
        'Average CV ROC AUC': avg_cv_score
    })

# Convert to DataFrame for easy viewing
performance_df = pd.DataFrame(performance_metrics)
print("Performance on Training Data (using Cross-Validation):")
print(performance_df)

# Pick the cut-off with the highest mean CV ROC AUC (idxmax returns the first on ties)
best_threshold = performance_df.loc[performance_df['Average CV ROC AUC'].idxmax(), 'Threshold']
best_features = importance_df[importance_df['Importance'] >= best_threshold]['Feature'].tolist()
print(f"\nBest threshold: {best_threshold} with features: {best_features}")

# Restrict both partitions to the winning feature subset
X_train_best = X_train_full[best_features]
X_test_best = X_test[best_features]

# Train the final Gradient Boosting model on the whole training portion
gb_final_model = GradientBoostingClassifier(random_state=42)
gb_final_model.fit(X_train_best, y_train_full)

# Predict classes and class-1 probabilities for the held-out test rows
y_pred = gb_final_model.predict(X_test_best)
y_pred_proba = gb_final_model.predict_proba(X_test_best)[:, 1]

# Evaluate the final model on the test set
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print("\nFinal Gradient Boosting Model Performance on Test Set:")
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Performance on Training Data (using Cross-Validation):
   Threshold  Features Used  Average CV ROC AUC
0      0.005             42            0.602759
1      0.010             42            0.602759
2      0.020             30            0.632103

Best threshold: 0.02 with features: ['pitch_count_fastball', 'batting_avg', 'arm_angle', 'bb_percent', 'ff_avg_speed', 'n_cu_formatted', 'n_ch_formatted', 'cu_avg_spin', 'ff_avg_spin', 'ch_avg_speed', 'n', 'pa', 'n_sl_formatted', 'pitch_count', 'barrel', 'solidcontact_percent', 'swing_percent', 'z_swing_percent', 'avg_hyper_speed', 'z_swing_miss_percent', 'strikeout', 'iz_contact_percent', 'whiff_percent', 'oz_swing_miss_percent', 'pitch_count_offspeed', 'pitch_count_breaking', 'oz_contact_percent', 'out_zone_swing_miss', 'oz_swing_percent', 'n_ff_formatted']

Final Gradient Boosting Model Performance on Test Set:
Accuracy: 0.66
ROC AUC Score: 0.61
F1 Score: 0.11

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.97      0.79        60
           1       0.50      0.06      0.11        31

    accuracy                           0.66        91
   macro avg       0.58      0.52      0.45        91
weighted avg       0.61      0.66      0.56        91

# Rank the features used by the final Gradient Boosting model by importance, highest first
feature_importances = gb_final_model.feature_importances_
features = X_train_best.columns

# Create a DataFrame for feature importances
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 1. Bar Plot of Top 10 Feature Importances
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
plt.title("Top 10 Feature Importances from Gradient Boosting - Pitching")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

# 2. Cumulative Feature Importance Plot
importance_df['Cumulative Importance'] = importance_df['Importance'].cumsum()

# Plot against the rank 1..n explicitly. After sort_values the frame's index still
# holds the pre-sort row labels; passing the Series alone would make matplotlib use
# those shuffled labels as x positions, so the curve would not follow feature count.
feature_rank = np.arange(1, len(importance_df) + 1)
plt.figure(figsize=(10, 6))
plt.plot(feature_rank, importance_df['Cumulative Importance'].to_numpy(), marker='o', color='b')
plt.xlabel("Number of Features")
plt.ylabel("Cumulative Importance")
plt.title("Cumulative Feature Importance - Pitching")
plt.axhline(y=0.9, color='r', linestyle='--', label='90% Cumulative Importance')
plt.legend()
plt.show()

# Split X_pitching's columns by dtype: model on the numeric ones, remember the other names
X_pitching_numeric = X_pitching.select_dtypes(include=['number'])
non_numeric_columns = X_pitching.select_dtypes(exclude=['number']).columns

# Run the XGBoost + SMOTE evaluation helper (defined elsewhere in the notebook) on the
# numeric-only features, passing the current importance ranking.
# NOTE(review): the recorded output of this cell lists features (e.g. 'xwoba',
# 'k_percent') that are not in the Gradient Boosting feature set built by the
# preceding cell; possibly it was executed against a different importance_df.
# Confirm with Restart & Run All.
evaluate_xgboost_with_smote(
    X_pitching_numeric,
    y_pitching,
    importance_df,
)

Performance on Training Data (using Cross-Validation):
   Threshold  Features Used  Average CV ROC AUC
0      0.005             13            0.889612
1      0.010             13            0.889612
2      0.020             13            0.889612

Best threshold: 0.005 with features: ['barrel_batted_rate', 'solidcontact_percent', 'k_percent', 'xwoba', 'oz_contact_percent', 'avg_hyper_speed', 'bb_percent', 'sweet_spot_percent', 'swing_percent', 'oz_swing_percent', 'home_run', 'z_swing_percent', 'batting_avg']

Final XGBoost Model with SMOTE Performance on Test Set:
Accuracy: 0.54
ROC AUC Score: 0.57
F1 Score: 0.30

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.67      0.66        60
           1       0.31      0.29      0.30        31

    accuracy                           0.54        91
   macro avg       0.48      0.48      0.48        91
weighted avg       0.53      0.54      0.53        91

# Local imports: only this cell needs them (both packages are already notebook dependencies)
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer

# Importance cut-offs to compare when selecting features
thresholds = [0.005, 0.01, 0.02]
performance_metrics = []

# Split data into training and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X_pitching, y_pitching, test_size=0.2, random_state=42)

# Drop non-numeric columns from both partitions (the column list is derived from the training part)
non_numeric_columns = X_train_full.select_dtypes(exclude=['number']).columns
X_train_full = X_train_full.drop(columns=non_numeric_columns)
X_test = X_test.drop(columns=non_numeric_columns)

# Only features that actually exist among the numeric training columns can be selected.
# NOTE(review): importance_df comes from whichever earlier cell last assigned it;
# confirm it holds the pitching importances before relying on this selection.
available_features = X_train_full.columns.intersection(importance_df['Feature'])

# Score each cut-off with cross-validation on the (un-resampled) training portion
for threshold in thresholds:
    # Filter features based on the importance threshold from the importance_df
    important_features = importance_df[importance_df['Importance'] >= threshold]['Feature']
    important_features = important_features[important_features.isin(available_features)].tolist()

    # SMOTE runs inside the pipeline so that, in every fold, synthetic samples are
    # generated from that fold's training rows only and the validation rows stay
    # untouched. Resampling before cross_val_score leaks synthetic neighbours of the
    # validation rows into training and inflates the CV score.
    # The steps mirror the final fit below: oversample on all numeric columns,
    # then keep the selected features, then fit Gradient Boosting.
    cv_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('select', ColumnTransformer([('keep', 'passthrough', important_features)])),
        ('gb', GradientBoostingClassifier(random_state=42)),
    ])
    cv_scores = cross_val_score(cv_pipeline, X_train_full, y_train_full, cv=5, scoring='roc_auc')
    avg_cv_score = np.mean(cv_scores)

    # Store performance metrics for each threshold
    performance_metrics.append({
        'Threshold': threshold,
        'Features Used': len(important_features),
        'Average CV ROC AUC': avg_cv_score
    })

# Convert to DataFrame for easy viewing
performance_df = pd.DataFrame(performance_metrics)
print("Performance on Training Data (using Cross-Validation):")
print(performance_df)

# Pick the cut-off with the highest mean CV ROC AUC (idxmax returns the first on ties)
best_threshold = performance_df.loc[performance_df['Average CV ROC AUC'].idxmax(), 'Threshold']
best_features = importance_df[importance_df['Importance'] >= best_threshold]['Feature']
best_features = best_features[best_features.isin(available_features)].tolist()
print(f"\nBest threshold: {best_threshold} with features: {best_features}")

# Balance the whole training portion once for the final fit; the test set is never resampled
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_full, y_train_full)

# Restrict both partitions to the winning feature subset
X_train_best = X_resampled[best_features]
X_test_best = X_test[best_features]

# Train Gradient Boosting on the resampled training data with the best feature subset
gb_final_model = GradientBoostingClassifier(random_state=42)
gb_final_model.fit(X_train_best, y_resampled)

# Predict classes and class-1 probabilities for the held-out test rows
y_pred = gb_final_model.predict(X_test_best)
y_pred_proba = gb_final_model.predict_proba(X_test_best)[:, 1]

# Evaluate the final model on the test set
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print("\nFinal Gradient Boosting Model with SMOTE Performance on Test Set:")
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Performance on Training Data (using Cross-Validation):
   Threshold  Features Used  Average CV ROC AUC
0      0.005             13             0.85054
1      0.010             13             0.85054
2      0.020             13             0.85054

Best threshold: 0.005 with features: ['barrel_batted_rate', 'solidcontact_percent', 'k_percent', 'xwoba', 'oz_contact_percent', 'avg_hyper_speed', 'bb_percent', 'sweet_spot_percent', 'swing_percent', 'oz_swing_percent', 'home_run', 'z_swing_percent', 'batting_avg']

Final Gradient Boosting Model with SMOTE Performance on Test Set:
Accuracy: 0.62
ROC AUC Score: 0.58
F1 Score: 0.41

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.73      0.72        60
           1       0.43      0.39      0.41        31

    accuracy                           0.62        91
   macro avg       0.56      0.56      0.56        91
weighted avg       0.61      0.62      0.61        91

# Pair each selected feature with its importance in the final model, highest first
feature_importances = gb_final_model.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': best_features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Bar chart of the ten highest-ranked features
plt.figure(figsize=(10, 8))
sns.barplot(data=importance_df.head(10), x='Importance', y='Feature')
plt.title("Top 10 Feature Importances from Gradient Boosting Model with SMOTE - Pitching")
plt.ylabel("Feature")
plt.xlabel("Importance")
plt.show()

# Copy the notebook file into the current directory, then convert that copy to HTML with nbconvert
!cp "/content/drive/MyDrive/Colab Notebooks/Silverstein_Ensmble_Model_Assignment.ipynb" ./
!jupyter nbconvert --to html "Silverstein_Ensmble_Model_Assignment.ipynb"

[NbConvertApp] Converting notebook Silverstein_Ensmble_Model_Assignment.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 10 image(s).
[NbConvertApp] Writing 946568 bytes to Silverstein_Ensmble_Model_Assignment.html

Are Players Getting Paid Accurately?¶

Project Information¶

Packages¶

Merging data¶

Data Cleanup¶

Missing Values¶

Impute rest¶

EDA¶

Hitting EDA¶

Pitching EDA¶

Feature Selection / Engineering¶

Hitting Modeling¶

Random Forest Hitting¶

Gradient Boost Hitting¶

Cross-Validation Performance for Thresholds¶

Final Model Performance on Test Set¶

Summary¶

Comparison of Feature Importances in Random Forest and Gradient Boosting Models¶

XGBoost with SMOTE¶

Pitching Models¶

Gradient Boost Pitching¶

Gradient Boost with SMOTE¶

Gradient Boost with SMOTE¶