Introduction¶

The goal of this project is to address a critical question in baseball analytics: Which types of players are underpaid, overpaid, or fairly paid based on their on-field performance and contract details? By combining advanced machine learning techniques with detailed performance and contract data, the project aims to provide actionable insights into salary fairness across different player categories.

Which machine learning methods did you implement?
The project implemented Ridge Regression, K-Means Clustering, Regression Models, Support Vector Machines (SVM), Ensemble Models (Random Forest and Gradient Boosting), and Neural Networks. Each method contributed uniquely to the analysis, providing a comprehensive and multi-faceted perspective on salary fairness.

Discuss the key contribution of each method to your analysis. If a method didn't contribute, discuss why it didn't. A sentence or two for each method is plenty.

  • Ridge Regression: This method was used for feature selection, helping to reduce multicollinearity and refine the dataset for subsequent analysis. It ensured the model considered the most impactful features while avoiding overfitting.
  • K-Means Clustering: It grouped players into distinct categories based on performance metrics, providing a foundation to evaluate salary fairness within specific player types (e.g., Power Hitters, Balanced Hitters, Utility Players).
  • Regression Models: These predicted fair salaries for players and categorized them as underpaid, overpaid, or fairly paid. This provided an objective baseline for salary fairness.
  • Support Vector Machines (SVM): SVMs analyzed the relationship between player categories (clusters) and salary fairness. They helped identify which clusters were more likely to contain underpaid or overpaid players.
  • Ensemble Models: Random Forest and Gradient Boosting models captured complex interactions between features and provided more robust predictions of salary fairness. These models highlighted subtle patterns missed by simpler methods.
  • Neural Networks: The neural network refined predictions further, capturing deeper nonlinear relationships between features and salary fairness. It improved classification accuracy and added robustness to the analysis.

Did all methods support your conclusions, or did some provide conflicting results? If some conflicted, how did you reconcile the differences?
While most methods aligned in identifying key trends, there were some discrepancies. For instance, SVM struggled with predicting overpaid players, as reflected in low precision and recall for that category. This was reconciled by prioritizing ensemble methods and neural networks for final predictions, as they provided better overall accuracy and insight into the patterns of salary fairness across player clusters.

Data Sources¶

  1. Statcast Data: Advanced player metrics, including launch angle, hard-hit percentage, and swing speed from Baseball Savant.
  2. Player Contracts Data: Salary and contract details, including annual average values (AAV) and total contract values from Spotrac.
  3. Career Batting Stats: Comprehensive batting data sourced from Fangraphs.

This workflow integrates clustering to understand player types, regression to define salary fairness benchmarks, and classification to validate the alignment between these insights. By combining these methods with advanced ensemble models and neural networks, the project ensures robust, interpretable insights into salary fairness for teams and players.

Packages¶

In [1]:
!pip install optuna
Requirement already satisfied: optuna in /usr/local/lib/python3.10/dist-packages (4.1.0)
Requirement already satisfied: alembic>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from optuna) (1.14.0)
Requirement already satisfied: colorlog in /usr/local/lib/python3.10/dist-packages (from optuna) (6.9.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from optuna) (1.26.4)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from optuna) (24.2)
Requirement already satisfied: sqlalchemy>=1.4.2 in /usr/local/lib/python3.10/dist-packages (from optuna) (2.0.36)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from optuna) (4.66.6)
Requirement already satisfied: PyYAML in /usr/local/lib/python3.10/dist-packages (from optuna) (6.0.2)
Requirement already satisfied: Mako in /usr/local/lib/python3.10/dist-packages (from alembic>=1.5.0->optuna) (1.3.8)
Requirement already satisfied: typing-extensions>=4 in /usr/local/lib/python3.10/dist-packages (from alembic>=1.5.0->optuna) (4.12.2)
Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy>=1.4.2->optuna) (3.1.1)
Requirement already satisfied: MarkupSafe>=0.9.2 in /usr/local/lib/python3.10/dist-packages (from Mako->alembic>=1.5.0->optuna) (3.0.2)
In [63]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing and feature engineering
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import RFE

# Model training and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans

# Metrics
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score,
    classification_report, confusion_matrix
)

# Dimensionality reduction
from sklearn.decomposition import PCA

# Deep learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Optimization
import optuna

# Statistical analysis
from scipy.stats import f_oneway

# Warnings
import warnings
from sklearn.exceptions import UndefinedMetricWarning
In [62]:
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=Warning)

Step 1 Data Preprocessing¶

Merging data¶

In [3]:
# Load datasets
career_batting = pd.read_csv('/content/career batting.csv')
contracts = pd.read_csv('/content/contracts.csv')
hitting = pd.read_csv('/content/hitting.csv')

# Step 1: Standardize and Align the 'Player' Column
# Fix Career Batting dataset
career_batting.rename(columns={'Name': 'Player'}, inplace=True)

# Fix Hitting dataset: Split 'last_name, first_name' into 'Player'
hitting['Player'] = hitting['last_name, first_name'].str.split(', ').str[::-1].str.join(' ')
hitting['Player'] = hitting['Player'].str.strip().str.title()

# Standardize 'Player' column in all datasets
contracts['Player'] = contracts['Player'].str.strip().str.title()
career_batting['Player'] = career_batting['Player'].str.strip().str.title()

# Step 2: Merge Datasets
data = pd.merge(hitting, contracts, on='Player', how='inner')
data = pd.merge(data, career_batting, on='Player', how='inner')

# Step 3: Display Merge Results
print(f"Merged dataset contains {data.shape[0]} rows and {data.shape[1]} columns.")
print(data.head())
Merged dataset contains 427 rows and 64 columns.
  last_name, first_name  player_id  year  home_run   woba  xwoba  \
0         DeLuca, Jonny     676356  2024         6  0.269  0.272   
1       Marte, Starling     516782  2024         7  0.313  0.337   
2           Brown, Seth     664913  2024        14  0.289  0.294   
3       Andujar, Miguel     609280  2024         4  0.306  0.286   
4        Correa, Carlos     621043  2024        14  0.385  0.358   

   avg_swing_speed  blasts_contact  blasts_swing  squared_up_contact  ...  \
0             70.8             8.0           6.2                26.1  ...   
1             72.4            17.2          12.4                33.6  ...   
2             73.3            14.5          10.5                30.9  ...   
3             71.6            11.6           9.9                31.0  ...   
4             74.5            19.9          16.2                32.2  ...   

        AVG       OBP       SLG      wOBA  xwOBA        wRC+       BsR  \
0  0.216867  0.277778  0.331325  0.268877  0.277   76.974637  0.649684   
1  0.268657  0.326975  0.388060  0.313053  0.338  103.886335  2.114106   
2  0.231183  0.282500  0.379032  0.289137  0.296   91.145522 -0.659635   
3  0.284768  0.319749  0.377483  0.305782  0.287  102.854265 -2.035956   
4  0.310345  0.387978  0.517241  0.385305  0.359  155.497900 -0.577044   

         Off        Def       WAR  
0  -8.887511   5.785232  0.925904  
1   3.833610 -10.129863  0.622260  
2  -4.712187 -10.903075 -0.235582  
3  -0.994141  -5.335523  0.444545  
4  22.727909   6.440044  4.275914  

[5 rows x 64 columns]
In [4]:
# Rename columns for clarity and usability
data.rename(columns={
    'Team\n                        Currently With': 'Current Team',
    'Age\n                        At Signing': 'Age at Signing'
}, inplace=True)

# Display the updated column names to confirm changes
print(data.columns)
Index(['last_name, first_name', 'player_id', 'year', 'home_run', 'woba',
       'xwoba', 'avg_swing_speed', 'blasts_contact', 'blasts_swing',
       'squared_up_contact', 'squared_up_swing', 'avg_swing_length', 'swords',
       'exit_velocity_avg', 'launch_angle_avg', 'sweet_spot_percent', 'barrel',
       'barrel_batted_rate', 'solidcontact_percent', 'flareburner_percent',
       'poorlyunder_percent', 'poorlytopped_percent', 'poorlyweak_percent',
       'hard_hit_percent', 'avg_best_speed', 'avg_hyper_speed',
       'z_swing_percent', 'z_swing_miss_percent', 'oz_swing_percent',
       'oz_swing_miss_percent', 'out_zone_swing', 'meatball_swing_percent',
       'meatball_percent', 'iz_contact_percent', 'in_zone_swing_miss',
       'whiff_percent', 'swing_percent', 'Player', 'Rank', 'Pos',
       'Current Team', 'Age at Signing', 'Value', 'AAV', 'G', 'PA', 'HR', 'R',
       'RBI', 'SB', 'BB%', 'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA',
       'xwOBA', 'wRC+', 'BsR', 'Off', 'Def', 'WAR'],
      dtype='object')

To keep the analysis grounded in past performance, I am eliminating any expected ("x") stats such as xwOBA.

In [5]:
# Remove columns starting with 'x' (e.g., xWOBA, xwOBA, etc.)
columns_to_drop = [col for col in data.columns if col.lower().startswith('x')]
data = data.drop(columns=columns_to_drop, errors='ignore')

# Display updated dataset columns to confirm
print(f"Remaining columns in the dataset: {data.columns.tolist()}")
Remaining columns in the dataset: ['last_name, first_name', 'player_id', 'year', 'home_run', 'woba', 'avg_swing_speed', 'blasts_contact', 'blasts_swing', 'squared_up_contact', 'squared_up_swing', 'avg_swing_length', 'swords', 'exit_velocity_avg', 'launch_angle_avg', 'sweet_spot_percent', 'barrel', 'barrel_batted_rate', 'solidcontact_percent', 'flareburner_percent', 'poorlyunder_percent', 'poorlytopped_percent', 'poorlyweak_percent', 'hard_hit_percent', 'avg_best_speed', 'avg_hyper_speed', 'z_swing_percent', 'z_swing_miss_percent', 'oz_swing_percent', 'oz_swing_miss_percent', 'out_zone_swing', 'meatball_swing_percent', 'meatball_percent', 'iz_contact_percent', 'in_zone_swing_miss', 'whiff_percent', 'swing_percent', 'Player', 'Rank', 'Pos', 'Current Team', 'Age at Signing', 'Value', 'AAV', 'G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%', 'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'wRC+', 'BsR', 'Off', 'Def', 'WAR']

EDA¶

Average Annual Value Distribution (Target)¶

In [6]:
# Convert AAV to numeric by removing dollar signs and commas
data['AAV'] = data['AAV'].replace({'\$': '', ',': ''}, regex=True).astype(float)

# Plot the distribution of AAV
plt.figure(figsize=(10, 6))
plt.hist(data['AAV'], bins=30, edgecolor='k', alpha=0.7)
plt.title('Distribution of Average Annual Value (AAV)', fontsize=14)
plt.xlabel('AAV ($)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
[Figure: histogram of the AAV distribution]

This histogram shows the distribution of Average Annual Value (AAV) for player contracts, revealing a highly skewed distribution where most players earn significantly less, with a few earning substantially higher salaries. This imbalance will impact the regression and clustering models, as special care will be needed to handle the skewness (e.g., log transformation or robust scaling) to ensure fairness and accuracy when identifying overpaid and underpaid players.
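As a concrete illustration of the log-transform option, the target can be fit in log space and the predictions inverted back to dollars. This is a minimal sketch, not part of the original pipeline; it assumes a feature matrix X and raw AAV target y like those built in the modeling cells below.

In [ ]:
# Sketch: fit Ridge on log(1 + AAV) to reduce skew, then invert predictions.
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

def fit_on_log_salary(X, y):
    """Train in log space and return dollar-scale test predictions."""
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RidgeCV(alphas=np.logspace(-4, 4, 100), cv=5)
    model.fit(X_tr, np.log1p(y_tr))              # log1p compresses the long right tail
    return np.expm1(model.predict(X_te)), y_te   # expm1 maps predictions back to dollars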

Relationships between performance metrics and salary¶

In [7]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data['woba'], y=data['AAV'])
plt.title('Scatter Plot of AAV vs wOBA', fontsize=14)
plt.xlabel('wOBA', fontsize=12)
plt.ylabel('AAV ($)', fontsize=12)
plt.grid(alpha=0.3)
plt.show()
[Figure: scatter plot of AAV vs. wOBA]

The scatter plot shows that higher woba values generally correspond to higher salaries, but there is significant variance, particularly at mid-range woba values, suggesting inconsistencies in how performance translates to pay. This variability indicates the need for models that capture nonlinear relationships and account for potential outliers when predicting salary fairness.

Correlation Heatmap¶

In [8]:
# Filter numeric columns for correlation calculation, excluding irrelevant features
numeric_data = data.select_dtypes(include=['float64', 'int64']).drop(columns=['player_id'], errors='ignore')

# Identify the top 10 features most correlated with AAV
correlation_with_aav = numeric_data.corr()['AAV'].abs().sort_values(ascending=False)
top_10_features = correlation_with_aav.index[1:11]  # Exclude 'AAV' itself

# Generate a heatmap for the top 10 features
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_data[top_10_features].corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Top 10 Features Most Correlated with AAV (Excluding Less Relevant Columns)', fontsize=14)
plt.show()
[Figure: correlation heatmap of the top 10 features most correlated with AAV]

The heatmap indicates that features such as barrel, home_run, HR, and wOBA exhibit strong positive correlations with each other and with AAV, making them significant predictors of player salary. However, the high collinearity among these metrics underscores the need for feature selection techniques (e.g., Ridge regression) to prioritize the most impactful predictors while minimizing redundancy and improving model interpretability.
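One way to quantify that collinearity directly is the variance inflation factor (VIF). The sketch below uses statsmodels, which is not otherwise used in this notebook, so treat it as a supplementary diagnostic; numeric_data and top_10_features are assumed from the cell above.

In [ ]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif_table(df):
    """VIF per column; values well above ~10 indicate strong collinearity."""
    X = df.dropna().astype(float)
    vifs = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return pd.Series(vifs, index=X.columns).sort_values(ascending=False)

# Example (identical columns like home_run/HR will show infinite VIF):
# print(vif_table(numeric_data[top_10_features]))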

Pairwise Scatterplot¶

In [9]:
# Select the top features from the correlation heatmap for pairplot
top_features_correlation = ['barrel', 'home_run', 'HR', 'Off', 'RBI', 'R', 'WAR', 'PA', 'out_zone_swing', 'woba']

# Generate pairplot for the selected features
sns.pairplot(data[top_features_correlation], diag_kind='kde', plot_kws={'alpha': 0.7})
plt.suptitle('Pairwise Scatter Plots of Features from Correlation Heatmap', y=1.02, fontsize=14)
plt.show()
[Figure: pairwise scatter plots of the selected features; output hidden in this export]

The pairwise scatter plots show strong linear and nonlinear relationships between many of the selected features (e.g., barrel, home_run, woba, and WAR). This indicates that these features are interrelated, which could introduce multicollinearity in regression models and clustering analysis. For clustering, the clear patterns suggest well-defined player groupings are possible; however, regularization techniques like Ridge regression can help address multicollinearity by prioritizing the most relevant features, ensuring both interpretability and robust model performance.

Modeling¶

Penalized Regression with Ridge¶

In [10]:
# Step 1: Drop irrelevant columns that won't affect clustering
columns_to_drop = ['Current Team', 'Value', 'Pos', 'player_id', 'Player']
numeric_data = data.drop(columns=columns_to_drop, errors='ignore')

# Step 2: Retain only numeric columns
numeric_data = numeric_data.select_dtypes(include=['float64', 'int64'])

# Step 3: Define X (features) and y (target - AAV for salary prediction)
X = numeric_data.drop(columns=['AAV'], errors='ignore')  # Drop 'AAV' from features
y = data['AAV']  # Target variable is the player's salary (AAV)

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Apply Ridge Regression with cross-validation to select the optimal alpha
ridge = RidgeCV(alphas=np.logspace(-4, 4, 100), cv=5).fit(X_train, y_train)

# Step 6: Output model performance
print(f"Optimal Alpha: {ridge.alpha_}")
print(f"Training Score: {ridge.score(X_train, y_train)}")
print(f"Testing Score: {ridge.score(X_test, y_test)}")
Optimal Alpha: 3944.206059437664
Training Score: 0.488496115355994
Testing Score: 0.47744649196711564

The Ridge regression model achieved an optimal alpha of ~3944, with a training score of 0.49 and a testing score of 0.48, indicating moderate predictive power and a minimal gap between the two scores. While the scores suggest that the model captures some variance in player salary (AAV), other external factors not included in the dataset (e.g., market trends, team-specific strategies) likely play a role in salary determination. However, since the primary objective of the project is to provide interpretable clustering and salary fairness insights rather than perfect salary prediction, these scores are sufficient to proceed with meaningful analysis.
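Since the introduction credits Ridge with guiding feature selection, it helps to see which features the penalized fit weights most heavily. This short sketch uses the fitted ridge object from the cell above; note the coefficients are on raw feature scales here, so standardizing X first would make the magnitudes directly comparable.

In [ ]:
import pandas as pd

# Rank features by absolute Ridge coefficient (largest influence on predicted AAV).
ridge_coefs = pd.Series(ridge.coef_, index=X.columns)
print(ridge_coefs.abs().sort_values(ascending=False).head(10))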

Clustering with K-Means¶

Step 1: Standardize the Features

In [11]:
# Step 1: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Features have been standardized.")
Features have been standardized.

Step 2: Determine the Optimal Number of Clusters Using the Elbow Method

In [12]:
# Step 2: Determine the optimal number of clusters using the Elbow Method
wcss = []  # Within-cluster sum of squares
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plot the Elbow Method
plt.figure(figsize=(8, 5))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method for Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()
[Figure: elbow plot of WCSS vs. number of clusters]

The Elbow Method plot shows a noticeable "elbow" at 3 clusters, where the within-cluster sum of squares (WCSS) begins to decrease at a slower rate. This suggests that using 3 clusters would balance simplicity and the ability to capture meaningful groupings in the data.
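As a cross-check on the elbow (a suggestion, not something run in the original notebook), the silhouette score measures cluster separation directly and should also favor a small k if 3 is a good choice; X_scaled is assumed from the standardization step above.

In [ ]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Higher silhouette = tighter, better-separated clusters.
for k in range(2, 7):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_scaled)
    print(f"k={k}: silhouette = {silhouette_score(X_scaled, labels):.3f}")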

Step 3: Apply K-Means Clustering with 3 Clusters

In [13]:
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

Visualize Clusters

In [14]:
# Calculate cluster centers
cluster_centers = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

# Calculate the range of each feature across clusters
feature_ranges = cluster_centers.max() - cluster_centers.min()

# Rank features by their range across clusters
cluster_feature_importance = feature_ranges.sort_values(ascending=False)
print("Feature Importance for Clusters (based on range):")
print(cluster_feature_importance)
Feature Importance for Clusters (based on range):
RBI                       1.782278
home_run                  1.763592
HR                        1.763592
barrel                    1.751891
R                         1.705439
PA                        1.700571
out_zone_swing            1.648301
in_zone_swing_miss        1.644526
G                         1.632397
SLG                       1.569091
wOBA                      1.535930
woba                      1.534729
wRC+                      1.532636
K%                        1.501313
barrel_batted_rate        1.469382
avg_swing_speed           1.452471
ISO                       1.451022
WAR                       1.444572
avg_hyper_speed           1.414773
avg_best_speed            1.404051
whiff_percent             1.369697
squared_up_swing          1.358466
swords                    1.336693
oz_swing_miss_percent     1.326162
iz_contact_percent        1.276464
z_swing_miss_percent      1.263196
blasts_contact            1.254969
OBP                       1.232009
hard_hit_percent          1.230051
Off                       1.209755
AVG                       1.197710
exit_velocity_avg         1.193627
blasts_swing              1.184209
squared_up_contact        0.979993
avg_swing_length          0.851049
poorlyweak_percent        0.772629
SB                        0.720279
solidcontact_percent      0.666515
BABIP                     0.592202
meatball_percent          0.555028
flareburner_percent       0.545582
Def                       0.517075
poorlytopped_percent      0.486297
BB%                       0.477383
z_swing_percent           0.451283
sweet_spot_percent        0.428835
meatball_swing_percent    0.425121
Age at Signing            0.404148
launch_angle_avg          0.356471
swing_percent             0.228362
poorlyunder_percent       0.155923
BsR                       0.126523
oz_swing_percent          0.035725
year                      0.000000
dtype: float64

The feature importance rankings based on the range across cluster centers indicate that RBI, home_run, and HR are the most defining features for distinguishing the clusters, suggesting that offensive performance metrics play a significant role in cluster differentiation. Similarly, features like barrel, PA, and out_zone_swing also contribute strongly, emphasizing both power-hitting tendencies and plate discipline as key traits in cluster separation. On the other hand, features with lower importance, such as BsR (base running), oz_swing_percent, and year, have minimal impact, indicating that these characteristics are less relevant in defining the player groupings.

In [15]:
# Select top 10 features by importance
top_features = cluster_feature_importance.head(10).index.tolist()

# Examine cluster centers for top features
top_cluster_centers = cluster_centers[top_features]
print("Cluster Centers for Top Features:")
print(top_cluster_centers)

# Visualize cluster centers to identify patterns
# (pandas' .plot creates its own figure, so a separate plt.figure call would
#  only leave an empty figure behind)
top_cluster_centers.T.plot(kind='bar', figsize=(14, 8), colormap='viridis')
plt.title("Top Feature Averages Across Clusters", fontsize=16)
plt.xlabel("Features", fontsize=14)
plt.ylabel("Cluster Center Values", fontsize=14)
plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Cluster Centers for Top Features:
        RBI  home_run        HR    barrel         R        PA  out_zone_swing  \
0  0.850462  0.955295  0.955295  0.977622  0.764058  0.718688        0.725266   
1  0.098277 -0.177575 -0.177575 -0.245649  0.214206  0.317940        0.238906   
2 -0.931817 -0.808296 -0.808296 -0.774270 -0.941381 -0.981883       -0.923036   

   in_zone_swing_miss         G       SLG  
0            0.905643  0.654965  0.782040  
1           -0.201446  0.389540  0.006054  
2           -0.738883 -0.977432 -0.787052  
[Figure: bar chart of top feature averages across the three clusters]

The cluster centers reveal distinct differences across the top features, which help characterize the player types within each cluster. Cluster 0 shows significantly high values in power-related metrics like RBI, home runs, HR, and SLG, indicating it likely represents "Power Hitters." Cluster 1 has moderate values across most features, suggesting it includes "Balanced Hitters" with a focus on consistency rather than extremes. Cluster 2 exhibits the lowest values across all key metrics, particularly RBI and SLG, which aligns with the profile of "Utility Players" or players with lower offensive contributions.

To label the clusters:

  • Cluster 0 can be labeled as "Power Hitters" based on their dominance in power metrics like RBI and SLG.
  • Cluster 1 can be labeled as "Balanced Hitters" given their moderate performance across metrics.
  • Cluster 2 can be labeled as "Utility Players," as their overall contribution appears lower across the measured metrics.
In [16]:
# Add the numeric cluster labels from KMeans to the dataset
data['Cluster'] = cluster_labels  # This adds the numeric cluster labels (0, 1, 2) to the 'Cluster' column in the DataFrame

# Define a dictionary for cluster labels
cluster_labels_map = {
    0: "Power Hitters",      # Cluster 0 corresponds to Power Hitters
    1: "Balanced Hitters",   # Cluster 1 corresponds to Balanced Hitters
    2: "Utility Players"     # Cluster 2 corresponds to Utility Players
}

# Map the descriptive cluster labels to the numeric clusters
data['Cluster_Label'] = data['Cluster'].map(cluster_labels_map)

# Check if the mapping is successful by displaying a sample of the dataset
print(data[['Cluster', 'Cluster_Label']].drop_duplicates())
   Cluster     Cluster_Label
0        2   Utility Players
1        0     Power Hitters
3        1  Balanced Hitters

Regression Model to Define Salary Fairness¶

Based on Ridge Results

In [17]:
# Step 1: Predict salaries using the trained Ridge model
y_test_pred = ridge.predict(X_test)

# Step 2: Define a threshold for fairness (10% of average salary as an example)
threshold = 0.1 * y.mean()

# Step 3: Define salary fairness categories
def classify_salary(actual, predicted, threshold):
    if actual < predicted - threshold:
        return "Underpaid"
    elif actual > predicted + threshold:
        return "Overpaid"
    else:
        return "Fairly Paid"

# Step 4: Apply the classification logic to the test set
fairness_labels = [
    classify_salary(actual, predicted, threshold)
    for actual, predicted in zip(y_test, y_test_pred)
]

# Step 5: Combine the results into a new DataFrame
fairness_results = X_test.copy()
fairness_results['Actual Salary'] = y_test
fairness_results['Predicted Salary'] = y_test_pred
fairness_results['Fairness'] = fairness_labels

# Display a summary of the fairness classification
print(fairness_results[['Actual Salary', 'Predicted Salary', 'Fairness']].head(10))
     Actual Salary  Predicted Salary     Fairness
419      4300000.0      4.671809e+06  Fairly Paid
75        800000.0      1.828226e+06    Underpaid
177       800000.0      4.191798e+06    Underpaid
30        800000.0      2.206821e+06    Underpaid
358     12166667.0      2.783316e+06     Overpaid
271       800000.0      1.493708e+06    Underpaid
155      6750000.0      5.686262e+06     Overpaid
152       800000.0      1.996744e+06    Underpaid
165     16250000.0      2.218170e+07    Underpaid
175     51000000.0      2.838382e+07     Overpaid

The salary fairness analysis evaluates whether a player's actual salary aligns with their predicted salary based on performance metrics. Using Ridge regression, salaries are predicted from player performance data, and fairness is determined by comparing actual and predicted salaries within a defined threshold. Players are categorized as "Underpaid" if their actual salary falls significantly below their performance-based prediction, "Overpaid" if their salary exceeds the prediction by a large margin, or "Fairly Paid" if the two are closely aligned, ensuring a performance-driven evaluation of compensation.

In [18]:
# Plot the fairness distribution
plt.figure(figsize=(8, 5))
sns.countplot(data=fairness_results, x='Fairness', order=["Underpaid", "Fairly Paid", "Overpaid"])
plt.title("Salary Fairness Distribution", fontsize=16)
plt.xlabel("Fairness Category", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.tight_layout()
plt.show()
[Figure: count of players per salary fairness category]

The distribution of salary fairness categories shows that the majority of players are classified as either "Underpaid" or "Overpaid," with relatively fewer players falling into the "Fairly Paid" category. This suggests a potential mismatch between player performance and compensation for many players, emphasizing the need for performance-driven salary evaluations.

Model Evaluation

In [19]:
# Evaluate Ridge regression performance
mse = mean_squared_error(y_test, y_test_pred)
mae = mean_absolute_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print("Model Performance Metrics:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# Evaluate classification distribution
fairness_distribution = fairness_results['Fairness'].value_counts(normalize=True) * 100
print("\nFairness Classification Distribution:")
print(fairness_distribution)

# Visualize residuals
residuals = y_test - y_test_pred
plt.figure(figsize=(8, 5))
plt.scatter(y_test, residuals, alpha=0.7)
plt.axhline(y=0, color='r', linestyle='--')
plt.title("Residuals of Actual vs. Predicted Salary")
plt.xlabel("Actual Salary")
plt.ylabel("Residuals")
plt.show()
Model Performance Metrics:
Mean Squared Error (MSE): 52552780126582.57
Mean Absolute Error (MAE): 4420454.71
R-squared (R2): 0.48

Fairness Classification Distribution:
Fairness
Overpaid       45.348837
Underpaid      40.697674
Fairly Paid    13.953488
Name: proportion, dtype: float64
[Figure: residuals of actual vs. predicted salary]

Let's try some feature selection.

In [20]:
# Use RFE with Linear Regression
selector = RFE(estimator=LinearRegression(), n_features_to_select=10)  # Adjust the number of features
selector = selector.fit(X_train, y_train)

# Get selected features
selected_features = X.columns[selector.support_]
print("Selected Features by RFE:")
print(selected_features)

# Train the model using only selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

linear_model_rfe = LinearRegression()
linear_model_rfe.fit(X_train_selected, y_train)
y_test_pred_rfe = linear_model_rfe.predict(X_test_selected)

# Evaluate the RFE model
mse_rfe = mean_squared_error(y_test, y_test_pred_rfe)
mae_rfe = mean_absolute_error(y_test, y_test_pred_rfe)
r2_rfe = r2_score(y_test, y_test_pred_rfe)

print("RFE Model Performance:")
print(f"Mean Squared Error (MSE): {mse_rfe:.2f}")
print(f"Mean Absolute Error (MAE): {mae_rfe:.2f}")
print(f"R-squared (R2): {r2_rfe:.2f}")
Selected Features by RFE:
Index(['woba', 'BB%', 'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA',
       'WAR'],
      dtype='object')
RFE Model Performance:
Mean Squared Error (MSE): 60572139713454.79
Mean Absolute Error (MAE): 5034050.13
R-squared (R2): 0.40

The results of the Recursive Feature Elimination (RFE) model reveal the following:

The selected features (woba, BB%, K%, ISO, BABIP, AVG, OBP, SLG, wOBA, WAR) reflect a well-rounded set of performance metrics, including batting efficiency (woba, OBP), power (ISO, SLG), and overall player contribution (WAR). These features provide a balanced foundation for predicting salaries based on key aspects of player performance.

Model performance metrics indicate moderate predictive accuracy. The Mean Squared Error (MSE) of $6.06 \times 10^{13}$ and Mean Absolute Error (MAE) of 5,034,050.13 suggest that salary predictions deviate significantly from actual values on average. Additionally, the R-squared (R²) value of 0.40 highlights that the model explains only 40% of the variance in salary, leaving substantial room for improvement.

The moderate R² score shows that while the selected features provide some explanatory power, the model does not fully capture the underlying patterns in the data. The relatively high MSE and MAE suggest that while the model has improved with a refined feature set, it still has limitations in accurately predicting player salaries. To improve, additional feature engineering, alternative modeling techniques (e.g., ensemble methods like Random Forest or Gradient Boosting), and careful reassessment of the dataset for outliers or salary discrepancies should be considered.
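One inexpensive robustness check is to cross-validate the RFE feature set rather than rely on a single train/test split; cross_val_score is already imported above but unused. A sketch, assuming X, y, and selected_features as defined in the preceding cells:

In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# 5-fold R^2 for the RFE-selected features; the spread shows split sensitivity.
scores = cross_val_score(LinearRegression(), X[selected_features], y, cv=5, scoring='r2')
print(f"CV R^2: {scores.mean():.2f} +/- {scores.std():.2f}")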

Polynomial Features

In [21]:
# Generate polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train a linear model on polynomial features
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)
y_test_pred_poly = poly_model.predict(X_test_poly)

# Evaluate the polynomial model
mse_poly = mean_squared_error(y_test, y_test_pred_poly)
mae_poly = mean_absolute_error(y_test, y_test_pred_poly)
r2_poly = r2_score(y_test, y_test_pred_poly)

print("Polynomial Regression Model Performance:")
print(f"Mean Squared Error (MSE): {mse_poly:.2f}")
print(f"Mean Absolute Error (MAE): {mae_poly:.2f}")
print(f"R-squared (R2): {r2_poly:.2f}")
Polynomial Regression Model Performance:
Mean Squared Error (MSE): 366931699370140.38
Mean Absolute Error (MAE): 14483942.33
R-squared (R2): -2.65

The Polynomial Regression model demonstrates poor performance, as reflected in its evaluation metrics. The Mean Squared Error (MSE) of $3.67 \times 10^{14}$ and the Mean Absolute Error (MAE) of 14,483,942.33 indicate that the predictions are significantly off from actual values, with large errors on average.

The R-squared (R²) value of -2.65 is particularly concerning, as it implies that the model performs worse than a simple baseline model (e.g., predicting the mean salary). This suggests that the polynomial regression model is likely overfitting the training data or failing to capture meaningful relationships in the features.

The model's performance highlights the limitations of using polynomial regression in this context, especially when the dataset's relationships are not inherently nonlinear or when the feature set lacks sufficient predictive power. Simpler models like linear regression with regularization (e.g., Ridge) or ensemble models might perform better with these data characteristics.
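To make the suggested fix concrete, polynomial terms can be combined with standardization and a cross-validated Ridge penalty in a single pipeline, which usually tames this kind of overfitting. A sketch only, assuming the X_train/X_test split from the Ridge cell; it was not run in this notebook.

In [ ]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import RidgeCV

# Degree-2 terms -> standardize -> penalized fit with a CV-chosen alpha.
poly_ridge = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    RidgeCV(alphas=np.logspace(-2, 4, 50)),
)
poly_ridge.fit(X_train, y_train)
print(f"Test R^2: {poly_ridge.score(X_test, y_test):.2f}")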

Ensemble Methods for Determining Pay Equity¶

In [22]:
# Step 1: Apply Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest Model
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Model Performance:")
print(f"Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"Mean Absolute Error (MAE): {mae_rf:.2f}")
print(f"R-squared (R2): {r2_rf:.2f}")

# Step 2: Apply Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Evaluate Gradient Boosting Model
mse_gb = mean_squared_error(y_test, y_pred_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("\nGradient Boosting Model Performance:")
print(f"Mean Squared Error (MSE): {mse_gb:.2f}")
print(f"Mean Absolute Error (MAE): {mae_gb:.2f}")
print(f"R-squared (R2): {r2_gb:.2f}")
Random Forest Model Performance:
Mean Squared Error (MSE): 58179247382947.98
Mean Absolute Error (MAE): 4567600.40
R-squared (R2): 0.42

Gradient Boosting Model Performance:
Mean Squared Error (MSE): 58321976580156.20
Mean Absolute Error (MAE): 4581316.48
R-squared (R2): 0.42

This still isn't giving me a strong predictor. Let me do some feature engineering.

Check Correlation

In [23]:
# Ensure only numeric columns are used for correlation
numeric_data = data.select_dtypes(include=['float64', 'int64'])

# Check correlations with AAV
if 'AAV' in numeric_data.columns:
    correlation_with_aav = numeric_data.corr()['AAV'].sort_values(ascending=False)
    print("Correlation of features with AAV:")
    print(correlation_with_aav.head(20))  # Display top 20 correlations
else:
    print("Error: 'AAV' column not found in the numeric data.")
Correlation of features with AAV:
AAV                   1.000000
barrel                0.599061
home_run              0.591440
HR                    0.591440
Off                   0.560521
RBI                   0.554944
R                     0.544367
WAR                   0.542641
PA                    0.481392
out_zone_swing        0.433293
woba                  0.421734
wOBA                  0.421301
wRC+                  0.420590
SLG                   0.408186
in_zone_swing_miss    0.406619
G                     0.383526
OBP                   0.381380
ISO                   0.370662
blasts_swing          0.357841
avg_hyper_speed       0.330052
Name: AAV, dtype: float64

Re-run the models with interaction terms added and low-correlation features removed

In [24]:
# Step 2: Remove features with low correlation (e.g., less than 0.3)
low_corr_threshold = 0.3  # Adjust threshold as needed
low_corr_features = correlation_with_aav[correlation_with_aav.abs() < low_corr_threshold].index.tolist()
data = data.drop(columns=low_corr_features, errors='ignore')
print(f"Removed low correlation features: {low_corr_features}")

# Step 3: Add interaction terms for high-correlation features
data['home_run_RBI'] = data['home_run'] * data['RBI']  # Interaction of power metrics
data['barrel_WAR'] = data['barrel'] * data['WAR']      # Interaction of power and value metrics
data['PA_R'] = data['PA'] * data['R']                 # Interaction of plate appearances and runs

# Step 4: Prepare dataset again for modeling
numeric_data = data.select_dtypes(include=['float64', 'int64'])
X = numeric_data.drop(columns=['AAV'], errors='ignore')  # Features
y = data['AAV']  # Target variable

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train and evaluate Random Forest and Gradient Boosting again

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("\nRandom Forest Updated Metrics:")
print(f"R-squared: {r2_score(y_test, y_pred_rf):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_rf):.2f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_rf):.2f}")

# Gradient Boosting
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
print("\nGradient Boosting Updated Metrics:")
print(f"R-squared: {r2_score(y_test, y_pred_gb):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_gb):.2f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_gb):.2f}")
Removed low correlation features: ['hard_hit_percent', 'swords', 'BB%', 'Age at Signing', 'avg_swing_speed', 'SB', 'avg_swing_length', 'solidcontact_percent', 'squared_up_swing', 'sweet_spot_percent', 'launch_angle_avg', 'iz_contact_percent', 'squared_up_contact', 'BABIP', 'z_swing_percent', 'poorlyunder_percent', 'meatball_swing_percent', 'BsR', 'flareburner_percent', 'meatball_percent', 'swing_percent', 'oz_swing_percent', 'z_swing_miss_percent', 'whiff_percent', 'Def', 'oz_swing_miss_percent', 'poorlytopped_percent', 'poorlyweak_percent', 'K%']

Random Forest Updated Metrics:
R-squared: 0.59
MAE: 3809383.21
MSE: 41409893747045.52

Gradient Boosting Updated Metrics:
R-squared: 0.58
MAE: 3828080.14
MSE: 41900967779212.24

Still low. Let's try some hyperparameter tuning.

In [25]:
# Random Forest Parameter Grid
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Gradient Boosting Parameter Grid
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Random Forest Randomized Search
rf_random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=rf_param_grid,
    n_iter=20,  # Number of random combinations to try
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2
)
rf_random_search.fit(X_train, y_train)
print("Best Random Forest Parameters:", rf_random_search.best_params_)
print("Best Random Forest Score (MSE):", -rf_random_search.best_score_)

# Gradient Boosting Randomized Search
gb_random_search = RandomizedSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_distributions=gb_param_grid,
    n_iter=20,  # Number of random combinations to try
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2
)
gb_random_search.fit(X_train, y_train)
print("\nBest Gradient Boosting Parameters:", gb_random_search.best_params_)
print("Best Gradient Boosting Score (MSE):", -gb_random_search.best_score_)

# Evaluate Random Forest
y_pred_rf = rf_random_search.best_estimator_.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mape_rf = mean_absolute_percentage_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
explained_variance_rf = explained_variance_score(y_test, y_pred_rf)

print("\nRandom Forest Final Metrics:")
print(f"MSE: {mse_rf:.2f}")
print(f"RMSE: {rmse_rf:.2f}")
print(f"MAE: {mae_rf:.2f}")
print(f"MAPE: {mape_rf:.2%}")  # Show as percentage
print(f"R-squared: {r2_rf:.2f}")
print(f"Explained Variance Score: {explained_variance_rf:.2f}")

# Evaluate Gradient Boosting
y_pred_gb = gb_random_search.best_estimator_.predict(X_test)

mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
mape_gb = mean_absolute_percentage_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
explained_variance_gb = explained_variance_score(y_test, y_pred_gb)

print("\nGradient Boosting Final Metrics:")
print(f"MSE: {mse_gb:.2f}")
print(f"RMSE: {rmse_gb:.2f}")
print(f"MAE: {mae_gb:.2f}")
print(f"MAPE: {mape_gb:.2%}")  # Show as percentage
print(f"R-squared: {r2_gb:.2f}")
print(f"Explained Variance Score: {explained_variance_gb:.2f}")
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Random Forest Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}
Best Random Forest Score (MSE): 42198347442005.25
Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best Gradient Boosting Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 7, 'learning_rate': 0.1}
Best Gradient Boosting Score (MSE): 43175407439658.445

Random Forest Final Metrics:
MSE: 39907321734235.45
RMSE: 6317224.21
MAE: 3822361.80
MAPE: 143.91%
R-squared: 0.60
Explained Variance Score: 0.60

Gradient Boosting Final Metrics:
MSE: 40593618058757.37
RMSE: 6371312.11
MAE: 3833741.15
MAPE: 146.88%
R-squared: 0.60
Explained Variance Score: 0.60

The results for both the Random Forest and Gradient Boosting models indicate only modest predictive accuracy. With an $R^2$ score of 0.60, both models explain approximately 60% of the variance in AAV, leaving a substantial portion of the variability unexplained. Additionally, the high Mean Absolute Percentage Error (MAPE) values of 143.91% (Random Forest) and 146.88% (Gradient Boosting) reflect significant prediction errors, especially for lower-salary players, whose small salaries inflate percentage-based error.

Despite this limited predictive accuracy, these models still serve as a useful baseline for defining Overpaid, Fairly Paid, and Underpaid categories. The predicted AAV values establish a systematic framework to benchmark actual salaries. By using an objective threshold (here, an actual salary within a band of ±10% of the mean AAV around the predicted salary counts as Fairly Paid), the models enable a consistent methodology to classify player salary fairness, even if precise salary predictions are less reliable.

The fairness classifications derived from these predictions will be instrumental in downstream tasks. This ensures that subsequent analyses, such as determining which player groups are most overpaid or underpaid, remain interpretable and actionable.

Interpretation

The Proportion of Players by Salary Fairness chart illustrates that the majority of players are either classified as "Fairly Paid" or "Underpaid," with "Overpaid" players constituting the smallest group. This suggests that, based on the model, actual salaries are more aligned with predictions for a significant portion of the dataset.

The Distribution of Actual Salaries by Fairness Category box plot highlights that "Overpaid" players generally have much higher actual salaries compared to the "Fairly Paid" and "Underpaid" groups. The spread of salaries is wide in the "Overpaid" category, indicating variability in overpayment, while "Fairly Paid" and "Underpaid" categories show much tighter distributions.

The Distribution of Predicted Salaries by Fairness Category box plot shows that the model predicts significantly higher salaries for the "Overpaid" group compared to the "Underpaid" group, aligning with the fairness classification. The "Fairly Paid" group has predicted salaries closely concentrated around the middle range, which aligns with its definition. The patterns provide validation that the model's classifications of fairness are consistent with both actual and predicted salary trends.
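The three charts described above are not reproduced in this export. A sketch of how they could be regenerated, assuming the data frame carries the 'Fairness' and 'Predicted Salary' columns constructed in the next cell:

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Proportion of players in each fairness category
data['Fairness'].value_counts(normalize=True).plot(kind='bar')
plt.title("Proportion of Players by Salary Fairness")
plt.ylabel("Proportion")
plt.show()

# Actual and predicted salary distributions per fairness category
for col, label in [('AAV', 'Actual'), ('Predicted Salary', 'Predicted')]:
    sns.boxplot(data=data, x='Fairness', y=col)
    plt.title(f"Distribution of {label} Salaries by Fairness Category")
    plt.show()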

SVM to See Which Types of Players Are Paid Fairly¶

I planned to use Optuna for hyperparameter tuning, since its Bayesian optimization explores the parameter space more intelligently than exhaustive methods like grid or random search. For this small search space, however, a simplified randomized search was fast enough, and that is what the executed cell below uses; an Optuna version is sketched first for reference.
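A minimal Optuna objective for this SVM could look like the following (the parameter ranges are illustrative assumptions, and X_train/y_train refer to the split built in the cell below):

In [ ]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

def objective(trial):
    # Sample SVM hyperparameters; a log scale for C mirrors the usual search space.
    C = trial.suggest_float('C', 1e-2, 1e2, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
    model = SVC(C=C, kernel=kernel, gamma='scale', random_state=42)
    return cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=25)
# print(study.best_params)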

In [32]:
# Step 1: Ensure Clusters and Fairness Labels Exist
# Add the clusters from the K-Means model
data['Cluster'] = kmeans.labels_  # Use the labels from the K-Means model
cluster_labels_map = {
    0: "Power Hitters",      # High in RBI, HR, SLG, and related metrics
    1: "Balanced Hitters",   # Moderate across all metrics
    2: "Utility Players"     # Lower contributions across all metrics
}
data['Cluster_Label'] = data['Cluster'].map(cluster_labels_map)

# Add the 'Fairness' column if it doesn't exist
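# NOTE: this predict call assumes rf_model was fit on exactly these features
# (top_features + ['Cluster']); a model trained on a different feature set
# would raise a feature-mismatch error here.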
if 'Predicted Salary' not in data.columns:
    data['Predicted Salary'] = rf_model.predict(data[top_features + ['Cluster']])

threshold = 0.1 * data['AAV'].mean()

def classify_salary(actual, predicted, threshold):
    if actual < predicted - threshold:
        return "Underpaid"
    elif actual > predicted + threshold:
        return "Overpaid"
    else:
        return "Fairly Paid"

data['Fairness'] = [
    classify_salary(actual, predicted, threshold)
    for actual, predicted in zip(data['AAV'], data['Predicted Salary'])
]

# Map Fairness to numeric labels for SVM
data['Fairness_Label'] = data['Fairness'].map({'Fairly Paid': 0, 'Underpaid': 1, 'Overpaid': 2})

# Step 2: Prepare the SVM Data
X = data[top_features + ['Cluster']]  # Include cluster information as a feature
y = data['Fairness_Label']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Simplified Random Search with Reduced Trials
reduced_svm_param_grid = {
    'C': [0.1, 1, 10],        # Limited options for regularization strength
    'kernel': ['linear', 'rbf'],  # Focus on common kernels
    'gamma': ['scale']         # Use default gamma for simplicity
}

svm_model = SVC(random_state=42)
svm_random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=reduced_svm_param_grid,
    n_iter=5,  # Fewer trials for faster results
    scoring='accuracy',
    cv=2,  # Reduce cross-validation folds
    n_jobs=-1,
    random_state=42,
    verbose=1  # Reduce verbosity
)
svm_random_search.fit(X_train, y_train)

# Step 4: Evaluate the SVM Model
best_svm = svm_random_search.best_estimator_
y_pred = best_svm.predict(X_test)

print("Simplified Random Search Best Parameters:", svm_random_search.best_params_)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Fairly Paid', 'Underpaid', 'Overpaid']))

# Step 5: Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=['Fairly Paid', 'Underpaid', 'Overpaid'],
            yticklabels=['Fairly Paid', 'Underpaid', 'Overpaid'])
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Step 6: Visualize Fairness by Cluster
fairness_cluster_summary = data.groupby(['Cluster', 'Fairness']).size().unstack(fill_value=0)
fairness_cluster_summary.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
plt.title("Fairness Categories by Clusters")
plt.xlabel("Player Cluster")
plt.ylabel("Number of Players")
plt.legend(title="Fairness Category")
plt.tight_layout()
plt.show()
Fitting 2 folds for each of 5 candidates, totalling 10 fits
Simplified Random Search Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'C': 0.1}

Classification Report:
              precision    recall  f1-score   support

 Fairly Paid       0.65      0.78      0.70        40
   Underpaid       0.42      0.55      0.48        29
    Overpaid       0.00      0.00      0.00        17

    accuracy                           0.55        86
   macro avg       0.36      0.44      0.39        86
weighted avg       0.44      0.55      0.49        86

/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[Figure: confusion matrix for the SVM classifier]
[Figure: fairness categories by cluster (stacked bars)]

The classification report indicates that the SVM model is reasonably effective at identifying "Fairly Paid" players, with a precision of 0.65 and recall of 0.78, but it fails entirely on "Overpaid" players, where both precision and recall are 0. This imbalance suggests the model is better suited to identifying fairly paid players than to flagging overpaid ones.

The confusion matrix visualizes this, showing a clear concentration of correct predictions in the "Fairly Paid" category but significant misclassifications between "Underpaid" and "Overpaid." The stacked bar chart adds context: fairness categories vary noticeably by player cluster, with "Power Hitters" spread across all three fairness types, while "Utility Players" are predominantly fairly paid (see the cluster table below).

Can It Be Improved Through Boosting and Bagging¶

In [35]:
# Step 1: Ensure Clusters and Fairness Labels Exist
data['Fairness_Label'] = data['Fairness'].map({'Fairly Paid': 0, 'Underpaid': 1, 'Overpaid': 2})
X = data[top_features + ['Cluster']]  # Use top features and cluster as predictors
y = data['Fairness_Label']

# Step 2: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Random Forest with Random Search
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rf_param_grid,
    n_iter=10,  # Fewer iterations for faster results
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2
)
rf_random_search.fit(X_train, y_train)
rf_best = rf_random_search.best_estimator_

# Random Forest Evaluation
rf_y_pred = rf_best.predict(X_test)
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_y_pred, target_names=['Fairly Paid', 'Underpaid', 'Overpaid']))

# Confusion Matrix for Random Forest
conf_matrix_rf = confusion_matrix(y_test, rf_y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_rf, annot=True, fmt="d", cmap="Blues",
            xticklabels=['Fairly Paid', 'Underpaid', 'Overpaid'],
            yticklabels=['Fairly Paid', 'Underpaid', 'Overpaid'])
plt.title("Confusion Matrix - Random Forest")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Step 4: Gradient Boosting with Random Search
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

gb_random_search = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_distributions=gb_param_grid,
    n_iter=10,  # Fewer iterations for faster results
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2
)
gb_random_search.fit(X_train, y_train)
gb_best = gb_random_search.best_estimator_

# Gradient Boosting Evaluation
gb_y_pred = gb_best.predict(X_test)
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, gb_y_pred, target_names=['Fairly Paid', 'Underpaid', 'Overpaid']))

# Confusion Matrix for Gradient Boosting
conf_matrix_gb = confusion_matrix(y_test, gb_y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_gb, annot=True, fmt="d", cmap="Blues",
            xticklabels=['Fairly Paid', 'Underpaid', 'Overpaid'],
            yticklabels=['Fairly Paid', 'Underpaid', 'Overpaid'])
plt.title("Confusion Matrix - Gradient Boosting")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Step 5: Fairness Distribution by Clusters
fairness_cluster_summary = data.groupby(['Cluster', 'Fairness']).size().unstack(fill_value=0)
fairness_cluster_summary.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
plt.title("Fairness Categories by Clusters")
plt.xlabel("Player Cluster")
plt.ylabel("Number of Players")
plt.legend(title="Fairness Category")
plt.tight_layout()
plt.show()
Fitting 3 folds for each of 10 candidates, totalling 30 fits

Random Forest Classification Report:
              precision    recall  f1-score   support

 Fairly Paid       0.63      0.72      0.67        40
   Underpaid       0.35      0.38      0.37        29
    Overpaid       0.44      0.24      0.31        17

    accuracy                           0.51        86
   macro avg       0.48      0.45      0.45        86
weighted avg       0.50      0.51      0.50        86

[Figure: confusion matrix for the Random Forest classifier]
Fitting 3 folds for each of 10 candidates, totalling 30 fits

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

 Fairly Paid       0.68      0.80      0.74        40
   Underpaid       0.50      0.52      0.51        29
    Overpaid       0.22      0.12      0.15        17

    accuracy                           0.57        86
   macro avg       0.47      0.48      0.47        86
weighted avg       0.53      0.57      0.54        86

[Figure: confusion matrix for the Gradient Boosting classifier]
[Figure: fairness categories by cluster (stacked bars)]

The results demonstrate a moderate level of success in identifying salary fairness categories using ensemble models like Random Forest and Gradient Boosting. The classification report for Gradient Boosting reveals strong performance for the "Fairly Paid" category with an F1-score of 0.74, but weaker performance for "Underpaid" (0.51) and "Overpaid" (0.15). This suggests the model is more adept at identifying fairly paid players, but struggles with the less represented "Overpaid" category, likely due to class imbalance.

The confusion matrices further highlight these observations, as most misclassifications occur between the "Underpaid" and "Overpaid" categories, indicating overlap in features or insufficient differentiation by the model. The fairness-by-cluster visualization (and the table below) reveals that "Utility Players" (Cluster 2) are predominantly fairly paid, while "Power Hitters" (Cluster 0) skew toward the underpaid and overpaid extremes. This suggests that the model captures some meaningful patterns but still faces challenges with generalizing fairness across all clusters.

Overall, while the models provide useful insights, there is clear room for improvement in handling class imbalance and in feature selection or representation. Techniques such as class rebalancing (a sketch follows below) or further-tuned ensembles could lift predictive accuracy, especially for the "Underpaid" and "Overpaid" categories.
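Class rebalancing is cheap to try. The sketch below assumes the same X_train/y_train split used for the searches above and was not run in this notebook: Random Forest accepts class weights directly, while Gradient Boosting takes per-sample weights instead.

# Reweight the rare "Overpaid" class so the models stop ignoring it
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight

# Random Forest: weight each class inversely to its frequency
rf_balanced = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_balanced.fit(X_train, y_train)

# Gradient Boosting has no class_weight argument; pass per-sample weights
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
gb_balanced = GradientBoostingClassifier(random_state=42)
gb_balanced.fit(X_train, y_train, sample_weight=sample_weights)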

Fairness Distribution by Cluster

In [38]:
# Define the mapping for clusters to player categories
cluster_labels_map = {
    0: "Power Hitters",
    1: "Balanced Hitters",
    2: "Utility Players"
}

# Replace the cluster numbers in the table with their corresponding labels
fairness_cluster_summary.index = fairness_cluster_summary.index.map(cluster_labels_map)

# Display the updated table
print(fairness_cluster_summary)
Fairness          Fairly Paid  Overpaid  Underpaid
Cluster                                           
Power Hitters              36        46         69
Balanced Hitters           45        29         51
Utility Players           116         8         27

Power Hitters have a significant proportion of both "Overpaid" and "Underpaid" players, with relatively fewer categorized as "Fairly Paid," highlighting a notable disparity in salary alignment within this group. Balanced Hitters show a more even distribution, but still have a notable number of "Underpaid" players. Utility Players, on the other hand, have the highest number of "Fairly Paid" players and very few "Overpaid" players, indicating that salaries for this group are generally more aligned with their performance.

Proportional Analysis

In [39]:
fairness_proportions = fairness_cluster_summary.div(fairness_cluster_summary.sum(axis=1), axis=0)
print(fairness_proportions)
Fairness          Fairly Paid  Overpaid  Underpaid
Cluster                                           
Power Hitters        0.238411  0.304636   0.456954
Balanced Hitters     0.360000  0.232000   0.408000
Utility Players      0.768212  0.052980   0.178808

Power Hitters show the weakest alignment, with 45.7% categorized as "Underpaid," 30.5% as "Overpaid," and only 23.8% as "Fairly Paid." Balanced Hitters fare somewhat better, at 40.8% "Underpaid," 23.2% "Overpaid," and 36.0% "Fairly Paid," reflecting moderate alignment overall. Utility Players stand out with 76.8% "Fairly Paid," only 5.3% "Overpaid," and 17.9% "Underpaid," the strongest salary alignment among the clusters.
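These proportions look far from uniform across clusters; a chi-square test of independence on the counts table above would quantify that impression. A minimal sketch, assuming SciPy is available alongside the notebook's other imports:

# Does the fairness category depend on the player cluster?
from scipy.stats import chi2_contingency

chi2, p_value, dof, expected = chi2_contingency(fairness_cluster_summary.values)
print(f"chi2 = {chi2:.1f}, dof = {dof}, p = {p_value:.3g}")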

Visualization

In [40]:
fairness_cluster_summary.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
plt.title("Fairness Categories by Clusters")
plt.xlabel("Player Cluster")
plt.ylabel("Number of Players")
plt.legend(title="Fairness Category")
plt.tight_layout()
plt.show()
[Figure: Fairness Categories by Clusters, stacked bar chart with named clusters]
In [41]:
sns.boxplot(x='Cluster', y='AAV', hue='Fairness', data=data)
plt.title("Actual Salary Distribution by Cluster and Fairness")
plt.show()

sns.boxplot(x='Cluster', y='Predicted Salary', hue='Fairness', data=data)
plt.title("Predicted Salary Distribution by Cluster and Fairness")
plt.show()
[Figure: Actual Salary Distribution by Cluster and Fairness, boxplot]
[Figure: Predicted Salary Distribution by Cluster and Fairness, boxplot]

The salary distributions across clusters show substantial variability, particularly for "Power Hitters," where both actual and predicted salaries span a wide range with several extreme outliers. These outliers motivate the neural-network analysis that follows: a more flexible model may categorize fairness more reliably, and flagged outliers can be reviewed or removed to bring predictions into closer alignment with cluster-specific performance metrics.
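One simple way to flag those outliers for review, sketched here under the assumption that data still carries the 'AAV' and 'Cluster' columns used in the boxplots (this cell is illustrative and was not run in the notebook):

# Flag per-cluster salary outliers with the 1.5*IQR rule
def iqr_outliers(s):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return (s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)

# True where a player's AAV is extreme relative to their own cluster
data['AAV_outlier'] = data.groupby('Cluster')['AAV'].transform(iqr_outliers)
print(data.groupby('Cluster')['AAV_outlier'].sum())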

Neural Networks for Further Analysis¶

Data Preparation¶

In [45]:
# Step 1: Normalize the feature set
scaler = StandardScaler()
numeric_features = ['barrel', 'home_run', 'HR', 'RBI', 'SLG', 'Predicted Salary', 'Cluster']  # Relevant features; note 'Cluster' is a categorical ID scaled here as if numeric
X = data[numeric_features]
X_scaled = scaler.fit_transform(X)

# Step 2: Encode the target variable (Fairness Labels)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['Fairness_Label'])  # Ensure 'Fairness_Label' is numeric (0, 1, 2)
y_categorical = to_categorical(y_encoded)  # Convert to one-hot encoding for multi-class classification

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded)

print("Data prepared:")
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Testing set: {X_test.shape}, {y_test.shape}")
Data prepared:
Training set: (341, 7), (341, 3)
Testing set: (86, 7), (86, 3)
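An aside on bookkeeping: train_test_split returns bare NumPy arrays here, so any later cell that re-attaches columns such as 'Cluster' has to rely on row positions. A minimal index-preserving variant (an alternative sketch, not what this notebook ran) keeps the row labels through the split:

# Imports repeated for self-containment (already loaded earlier in the notebook)
import pandas as pd
from sklearn.model_selection import train_test_split

# Wrap the scaled matrix in a DataFrame so the split carries row labels
X_scaled_df = pd.DataFrame(X_scaled, columns=numeric_features, index=data.index)
X_tr, X_te, y_tr, y_te = train_test_split(
    X_scaled_df, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded)

# Later cells can attach columns by label instead of by position
cluster_train = data.loc[X_tr.index, 'Cluster']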

Model Preparation¶

In [50]:
# Step 1: Rebuild X_train and X_test as DataFrames with named columns
X_train_df = pd.DataFrame(X_train, columns=data[top_features].columns[:X_train.shape[1]])
X_test_df = pd.DataFrame(X_test, columns=data[top_features].columns[:X_test.shape[1]])

# Attach 'Cluster' to each split. Caveat: X_train_df/X_test_df carry fresh
# 0..n-1 indices, so these .loc lookups pull clusters by position in `data`
# rather than by the shuffled split order; splitting with an index-preserving
# DataFrame (as sketched above) avoids this.
X_train_df['Cluster'] = data.loc[X_train_df.index, 'Cluster'].values
X_test_df['Cluster'] = data.loc[X_test_df.index, 'Cluster'].values

# Convert back to numpy arrays
X_train_with_clusters = X_train_df.values
X_test_with_clusters = X_test_df.values

# Step 2: Build the Neural Network Model
model = Sequential([
    Dense(64, input_dim=X_train_with_clusters.shape[1], activation='relu'),  # first hidden layer with 64 neurons; input_dim declares the feature count
    Dropout(0.3),  # Dropout to prevent overfitting
    Dense(32, activation='relu'),  # Hidden layer with 32 neurons
    Dropout(0.3),
    Dense(16, activation='relu'),  # Another hidden layer with 16 neurons
    Dense(y_train.shape[1], activation='softmax')  # Output layer with softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam',  # Adam optimizer
              loss='categorical_crossentropy',  # Loss for multi-class classification
              metrics=['accuracy'])

print(model.summary())

# Step 3: Train the Model
history = model.fit(X_train_with_clusters, y_train,
                    validation_split=0.2,
                    epochs=50,  # Adjust epochs as needed
                    batch_size=16,  # Mini-batch size
                    verbose=1)

# Step 4: Evaluate the Model
loss, accuracy = model.evaluate(X_test_with_clusters, y_test)
print(f"\nTest Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Step 5: Visualize Training Progress
plt.figure(figsize=(12, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Step 6: Analyze Results by Cluster
# Predict fairness categories for the test set
y_pred = np.argmax(model.predict(X_test_with_clusters), axis=1)

# Map predictions back to fairness categories
# (this assumes the labels were encoded 0='Fairly Paid', 1='Underpaid',
#  2='Overpaid'; check label_encoder.classes_ to confirm the actual order)
fairness_pred = pd.Series(y_pred).map({0: 'Fairly Paid', 1: 'Underpaid', 2: 'Overpaid'})

# Add predictions and clusters to a summary dataframe
results_df = pd.DataFrame({
    'Cluster': X_test_df['Cluster'],
    'True Fairness': pd.Series(np.argmax(y_test, axis=1)).map({0: 'Fairly Paid', 1: 'Underpaid', 2: 'Overpaid'}),
    'Predicted Fairness': fairness_pred
})

# Analyze fairness distribution by cluster
fairness_by_cluster = results_df.groupby('Cluster')['Predicted Fairness'].value_counts(normalize=True).unstack()
print(fairness_by_cluster)

# Visualize fairness distribution by cluster
fairness_by_cluster.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
plt.title('Fairness Prediction Distribution by Cluster')
plt.xlabel('Player Cluster')
plt.ylabel('Proportion')
plt.legend(title='Fairness Category')
plt.tight_layout()
plt.show()
/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                         ┃ Output Shape                ┃         Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ dense_4 (Dense)                      │ (None, 64)                  │             576 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dropout_2 (Dropout)                  │ (None, 64)                  │               0 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_5 (Dense)                      │ (None, 32)                  │           2,080 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dropout_3 (Dropout)                  │ (None, 32)                  │               0 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_6 (Dense)                      │ (None, 16)                  │             528 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_7 (Dense)                      │ (None, 3)                   │              51 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
 Total params: 3,235 (12.64 KB)
 Trainable params: 3,235 (12.64 KB)
 Non-trainable params: 0 (0.00 B)
None
Epoch 1/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 4s 43ms/step - accuracy: 0.2246 - loss: 1.1693 - val_accuracy: 0.5217 - val_loss: 1.0670
Epoch 2/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.4648 - loss: 1.0619 - val_accuracy: 0.5652 - val_loss: 1.0218
Epoch 3/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.5145 - loss: 1.0363 - val_accuracy: 0.5507 - val_loss: 0.9828
Epoch 4/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step - accuracy: 0.5609 - loss: 1.0067 - val_accuracy: 0.5652 - val_loss: 0.9460
Epoch 5/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.5792 - loss: 0.9247 - val_accuracy: 0.5652 - val_loss: 0.9179
Epoch 6/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.6251 - loss: 0.8729 - val_accuracy: 0.5797 - val_loss: 0.9050
Epoch 7/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.5675 - loss: 0.8712 - val_accuracy: 0.5797 - val_loss: 0.8876
Epoch 8/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.6938 - loss: 0.7504 - val_accuracy: 0.5942 - val_loss: 0.8745
Epoch 9/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - accuracy: 0.5608 - loss: 0.8675 - val_accuracy: 0.6087 - val_loss: 0.8453
Epoch 10/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 1s 12ms/step - accuracy: 0.6494 - loss: 0.8256 - val_accuracy: 0.6522 - val_loss: 0.8230
Epoch 11/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.6412 - loss: 0.8263 - val_accuracy: 0.6232 - val_loss: 0.7975
Epoch 12/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6405 - loss: 0.7704 - val_accuracy: 0.6377 - val_loss: 0.7743
Epoch 13/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6179 - loss: 0.7574 - val_accuracy: 0.6377 - val_loss: 0.7590
Epoch 14/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6185 - loss: 0.7804 - val_accuracy: 0.6522 - val_loss: 0.7559
Epoch 15/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.7186 - loss: 0.7183 - val_accuracy: 0.6522 - val_loss: 0.7604
Epoch 16/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6558 - loss: 0.7328 - val_accuracy: 0.6377 - val_loss: 0.7365
Epoch 17/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6746 - loss: 0.7809 - val_accuracy: 0.6377 - val_loss: 0.7204
Epoch 18/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7195 - loss: 0.6682 - val_accuracy: 0.6377 - val_loss: 0.7254
Epoch 19/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6132 - loss: 0.7899 - val_accuracy: 0.6377 - val_loss: 0.7142
Epoch 20/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6123 - loss: 0.7903 - val_accuracy: 0.6377 - val_loss: 0.7098
Epoch 21/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6870 - loss: 0.7149 - val_accuracy: 0.6522 - val_loss: 0.7213
Epoch 22/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7263 - loss: 0.6558 - val_accuracy: 0.6522 - val_loss: 0.7162
Epoch 23/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6732 - loss: 0.7611 - val_accuracy: 0.6232 - val_loss: 0.6950
Epoch 24/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6790 - loss: 0.7092 - val_accuracy: 0.6377 - val_loss: 0.6889
Epoch 25/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7029 - loss: 0.6892 - val_accuracy: 0.6377 - val_loss: 0.6879
Epoch 26/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.7311 - loss: 0.6062 - val_accuracy: 0.6377 - val_loss: 0.6824
Epoch 27/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step - accuracy: 0.6925 - loss: 0.6339 - val_accuracy: 0.6377 - val_loss: 0.6810
Epoch 28/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step - accuracy: 0.7426 - loss: 0.6591 - val_accuracy: 0.6522 - val_loss: 0.6876
Epoch 29/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.7164 - loss: 0.6015 - val_accuracy: 0.6377 - val_loss: 0.6923
Epoch 30/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6936 - loss: 0.6121 - val_accuracy: 0.6377 - val_loss: 0.6968
Epoch 31/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6732 - loss: 0.6753 - val_accuracy: 0.6667 - val_loss: 0.6781
Epoch 32/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6719 - loss: 0.6605 - val_accuracy: 0.6667 - val_loss: 0.6685
Epoch 33/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.7308 - loss: 0.6109 - val_accuracy: 0.6667 - val_loss: 0.6671
Epoch 34/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6786 - loss: 0.6823 - val_accuracy: 0.6667 - val_loss: 0.6625
Epoch 35/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.6741 - loss: 0.6703 - val_accuracy: 0.6667 - val_loss: 0.6680
Epoch 36/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6687 - loss: 0.6631 - val_accuracy: 0.6377 - val_loss: 0.6789
Epoch 37/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6559 - loss: 0.6474 - val_accuracy: 0.6377 - val_loss: 0.6729
Epoch 38/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.7335 - loss: 0.6217 - val_accuracy: 0.6667 - val_loss: 0.6539
Epoch 39/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.7076 - loss: 0.6215 - val_accuracy: 0.6522 - val_loss: 0.6550
Epoch 40/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7345 - loss: 0.6303 - val_accuracy: 0.6377 - val_loss: 0.6819
Epoch 41/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.7237 - loss: 0.6648 - val_accuracy: 0.6522 - val_loss: 0.6706
Epoch 42/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.7043 - loss: 0.6001 - val_accuracy: 0.6667 - val_loss: 0.6686
Epoch 43/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.7683 - loss: 0.6141 - val_accuracy: 0.6522 - val_loss: 0.6635
Epoch 44/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.6912 - loss: 0.6185 - val_accuracy: 0.6522 - val_loss: 0.6581
Epoch 45/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7393 - loss: 0.6437 - val_accuracy: 0.6522 - val_loss: 0.6593
Epoch 46/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.7281 - loss: 0.6000 - val_accuracy: 0.6232 - val_loss: 0.6618
Epoch 47/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6727 - loss: 0.6101 - val_accuracy: 0.6377 - val_loss: 0.6695
Epoch 48/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7014 - loss: 0.6247 - val_accuracy: 0.6377 - val_loss: 0.6630
Epoch 49/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.7559 - loss: 0.5823 - val_accuracy: 0.6667 - val_loss: 0.6585
Epoch 50/50
17/17 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - accuracy: 0.6760 - loss: 0.6398 - val_accuracy: 0.6522 - val_loss: 0.6477
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.7394 - loss: 0.6679 

Test Loss: 0.6681965589523315
Test Accuracy: 0.7209302186965942
[Figure: Model Accuracy, training vs. validation]
[Figure: Model Loss, training vs. validation]
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 40ms/step
Predicted Fairness  Fairly Paid  Overpaid  Underpaid
Cluster                                             
0                      0.391304  0.260870   0.347826
1                      0.545455  0.181818   0.272727
2                      0.390244  0.341463   0.268293
[Figure: Fairness Prediction Distribution by Cluster, stacked bar chart]

The neural network's test-set predictions give a distinct view of salary fairness across clusters. For "Power Hitters" (Cluster 0) the model predicts a relatively balanced distribution: 39% "Fairly Paid," 35% "Underpaid," and a noticeable 26% "Overpaid." "Balanced Hitters" (Cluster 1) receive the highest share of "Fairly Paid" predictions (54%), with 27% "Underpaid" and 18% "Overpaid," indicating the best alignment overall. "Utility Players" (Cluster 2) are the most evenly distributed, at 39% "Fairly Paid," 34% "Overpaid," and 27% "Underpaid." Note that these are predicted proportions on the 86-player test set, not the label-based distribution reported earlier (where Utility Players were 76.8% fairly paid), which accounts for much of the apparent discrepancy.
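Because results_df pairs true and predicted labels with cluster membership, per-cluster accuracy is a short computation and shows where the network's predictions are most trustworthy. A small sketch using the dataframe built above:

# Per-cluster accuracy from the prediction summary
per_cluster_acc = (
    results_df
    .assign(correct=lambda df: df['True Fairness'] == df['Predicted Fairness'])
    .groupby('Cluster')['correct']
    .mean()
)
print(per_cluster_acc)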

The accuracy and loss curves show steady improvement before the validation loss flattens out, and the model reaches a test accuracy of about 72% (validation accuracy plateaued near 65%). While the network provides useful insight into fairness within player clusters, the modest accuracy and overlapping proportions in some clusters suggest room for refinement: additional features, class weighting, or stronger regularization (sketched below) could sharpen classification and offer deeper insight into salary dynamics across player types.
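One such refinement, sketched below and not run here: early stopping keyed to the plateauing validation loss, plus class weights to counter the imbalance noted earlier. The import path assumes the TensorFlow-backed Keras used above.

from tensorflow.keras.callbacks import EarlyStopping

# Per-class weights from the one-hot training labels (rarer classes weigh more)
class_counts = y_train.sum(axis=0)
class_weight = {i: len(y_train) / (len(class_counts) * c)
                for i, c in enumerate(class_counts)}

# Stop when validation loss stalls and keep the best weights seen
early_stop = EarlyStopping(monitor='val_loss', patience=10,
                           restore_best_weights=True)

history = model.fit(X_train_with_clusters, y_train,
                    validation_split=0.2,
                    epochs=200,  # the callback decides the real endpoint
                    batch_size=16,
                    class_weight=class_weight,
                    callbacks=[early_stop],
                    verbose=0)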

In [61]:
!cp "/content/drive/MyDrive/Colab Notebooks/final_project.ipynb" ./
!jupyter nbconvert --to html "final_project.ipynb"
[NbConvertApp] Converting notebook final_project.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 18 image(s).
[NbConvertApp] Writing 1328985 bytes to final_project.html