%matplotlib inline


%%capture
from pandas.errors import PerformanceWarning
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.manifold import TSNE


# Silence warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.simplefilter("ignore", PerformanceWarning)


data = pd.read_csv("/content/fangraphs-leaderboards.csv", encoding='latin1')
data.head()


# Check for duplicate column names
duplicate_column_names = data.columns[data.columns.duplicated()].tolist()

# Display duplicate column names
print("Duplicate Column Names:", duplicate_column_names)

Duplicate Column Names: []


# Transpose the DataFrame and drop duplicates based on identical column values
duplicate_columns = data.T.duplicated()

# List the columns that are duplicated
repeated_columns = data.columns[duplicate_columns]

# Display the repeated columns
print("Repeated Columns:", repeated_columns)

Repeated Columns: Index(['Team.1', 'G.1', 'PA.1', 'HR.1', 'R.1', 'RBI.1', 'SB.1', 'AVG.1',
       'BB%.1', 'K%.1',
       ...
       'wOppTeamV.4', 'wTeamV.4', 'wNetBatV.4', 'TG.4', 'Bats.4', 'FPTS.4',
       'FPTS/G.4', 'SPTS.4', 'SPTS/G.4', 'XBR.3'],
      dtype='object', length=1391)


# Transpose the DataFrame and check for duplicated columns based on identical content
duplicate_columns = data.T.duplicated()

# Get the names of the duplicated columns
repeated_columns = data.columns[duplicate_columns]

# Drop the duplicated columns from the DataFrame
data = data.drop(columns=repeated_columns)

# Display the remaining columns to verify that duplicates are removed
print(f"Deleted {len(repeated_columns)} duplicated columns.")

Deleted 1391 duplicated columns.


# Display a list of all column names
column_names = data.columns.tolist()

# Print the list of column names
print("List of columns in the DataFrame:")
print(column_names)

List of columns in the DataFrame:
['ï»¿Season', 'Name', 'Team', 'G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%', 'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'xwOBA', 'wRC+', 'BsR', 'Off', 'Def', 'WAR', 'AB', 'H', '1B', '2B', '3B', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP', 'CS', 'GB', 'FB', 'LD', 'IFFB', 'Pitches', 'Balls', 'Strikes', 'IFH', 'BU', 'BUH', 'BB/K', 'OPS', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'wRAA', 'wRC', 'Bat', 'Fld', 'Rep', 'Pos', 'RAR', 'Dol', 'Spd', 'WPA', '-WPA', '+WPA', 'RE24', 'REW', 'pLI', 'phLI', 'PH', 'WPA/LI', 'Clutch', 'FB%.1', 'FBv', 'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%', 'wFB', 'wSL', 'wCT', 'wCB', 'wCH', 'wSF', 'wKN', 'wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'FA% (sc)', 'FC% (sc)', 'FS% (sc)', 'FO% (sc)', 'SI% (sc)', 'SL% (sc)', 'CU% (sc)', 'KC% (sc)', 'EP% (sc)', 'CH% (sc)', 'SC% (sc)', 'KN% (sc)', 'vFA (sc)', 'vFC (sc)', 'vFS (sc)', 'vFO (sc)', 'vSI (sc)', 'vSL (sc)', 'vCU (sc)', 'vKC (sc)', 'vEP (sc)', 'vCH (sc)', 'vSC (sc)', 'vKN (sc)', 'FA-X (sc)', 'FC-X (sc)', 'FS-X (sc)', 'FO-X (sc)', 'SI-X (sc)', 'SL-X (sc)', 'CU-X (sc)', 'KC-X (sc)', 'EP-X (sc)', 'CH-X (sc)', 'SC-X (sc)', 'KN-X (sc)', 'FA-Z (sc)', 'FC-Z (sc)', 'FS-Z (sc)', 'FO-Z (sc)', 'SI-Z (sc)', 'SL-Z (sc)', 'CU-Z (sc)', 'KC-Z (sc)', 'EP-Z (sc)', 'CH-Z (sc)', 'SC-Z (sc)', 'KN-Z (sc)', 'wFA (sc)', 'wFC (sc)', 'wFS (sc)', 'wFO (sc)', 'wSI (sc)', 'wSL (sc)', 'wCU (sc)', 'wKC (sc)', 'wEP (sc)', 'wCH (sc)', 'wSC (sc)', 'wKN (sc)', 'wFA/C (sc)', 'wFC/C (sc)', 'wFS/C (sc)', 'wFO/C (sc)', 'wSI/C (sc)', 'wSL/C (sc)', 'wCU/C (sc)', 'wKC/C (sc)', 'wEP/C (sc)', 'wCH/C (sc)', 'wSC/C (sc)', 'wKN/C (sc)', 'O-Swing% (sc)', 'Z-Swing% (sc)', 'Swing% (sc)', 'O-Contact% (sc)', 'Z-Contact% (sc)', 'Contact% (sc)', 'Zone% (sc)', 'Pace', 'wSB', 'UBR', 'Age Rng', 'Lg', 'wGDP', 'Pull%', 'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'TTO%', 'CH% (pi)', 'CS% (pi)', 'CU% (pi)', 'FA% (pi)', 'FC% (pi)', 'FS% (pi)', 'KN% (pi)', 'SB% (pi)', 'SI% (pi)', 'SL% (pi)', 'XX% (pi)', 'vCH (pi)', 'vCS (pi)', 'vCU (pi)', 'vFA (pi)', 'vFC (pi)', 'vFS (pi)', 'vKN (pi)', 'vSB (pi)', 'vSI (pi)', 'vSL (pi)', 'vXX (pi)', 'CH-X (pi)', 'CS-X (pi)', 'CU-X (pi)', 'FA-X (pi)', 'FC-X (pi)', 'FS-X (pi)', 'KN-X (pi)', 'SB-X (pi)', 'SI-X (pi)', 'SL-X (pi)', 'XX-X (pi)', 'CH-Z (pi)', 'CS-Z (pi)', 'CU-Z (pi)', 'FA-Z (pi)', 'FC-Z (pi)', 'FS-Z (pi)', 'KN-Z (pi)', 'SB-Z (pi)', 'SI-Z (pi)', 'SL-Z (pi)', 'XX-Z (pi)', 'wCH (pi)', 'wCS (pi)', 'wCU (pi)', 'wFA (pi)', 'wFC (pi)', 'wFS (pi)', 'wKN (pi)', 'wSB (pi)', 'wSI (pi)', 'wSL (pi)', 'wXX (pi)', 'wCH/C (pi)', 'wCS/C (pi)', 'wCU/C (pi)', 'wFA/C (pi)', 'wFC/C (pi)', 'wFS/C (pi)', 'wKN/C (pi)', 'wSB/C (pi)', 'wSI/C (pi)', 'wSL/C (pi)', 'wXX/C (pi)', 'O-Swing% (pi)', 'Z-Swing% (pi)', 'Swing% (pi)', 'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)', 'Zone% (pi)', 'FRM', 'AVG+', 'BB%+', 'K%+', 'OBP+', 'SLG+', 'ISO+', 'BABIP+', 'LD+%', 'GB%+', 'FB%+', 'HR/FB%+', 'Pull%+', 'Cent%+', 'Oppo%+', 'Soft%+', 'Med%+', 'Hard%+', 'EV', 'LA', 'Barrels', 'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events', 'CStr%', 'CSW%', 'L-WAR', 'PPTV', 'CPTV', 'BPTV', 'DSV', 'DGV', 'BTV', 'wPPTV', 'wCPTV', 'wBPTV', 'wDSV', 'wDGV', 'wBTV', 'EBV', 'ESV', 'wOppTeamV', 'wTeamV', 'wNetBatV', 'TG', 'Bats', 'XBR', 'Age', 'NameASCII', 'PlayerId', 'MLBAMID']


# Rename the column from 'ï»¿Season' to 'Season'
data.rename(columns={'ï»¿Season': 'Season'}, inplace=True)

# Verify the column renaming
print(data.columns)

Index(['Season', 'Name', 'Team', 'G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%',
       ...
       'wOppTeamV', 'wTeamV', 'wNetBatV', 'TG', 'Bats', 'XBR', 'Age',
       'NameASCII', 'PlayerId', 'MLBAMID'],
      dtype='object', length=331)


# Check for missing values
data.isnull().sum()


# drop columns with more than 20% missing values
threshold = len(data) * 0.80

# Drop columns with more than 20% missing values
data = data.dropna(axis=1, thresh=threshold)

# Recalculate the percentage of missing values per column
missing_percentages = (data.isnull().sum() / len(data)) * 100

# Sort and display the top 15 columns with the highest percentage of missing values
top_15_null_columns = missing_percentages.sort_values(ascending=False).head(15)

# Display
print(top_15_null_columns)

phLI        18.400000
XX%         14.618182
XBR         10.327273
Fld          0.727273
UBR          0.218182
wGDP         0.218182
CH% (pi)     0.000000
FS% (pi)     0.000000
FC% (pi)     0.000000
FA% (pi)     0.000000
CU% (pi)     0.000000
Season       0.000000
TTO%         0.000000
SI% (pi)     0.000000
Med%         0.000000
dtype: float64


# Select only numeric columns (int and float) for skewness and imputation
numeric_columns = data.select_dtypes(include=['float64', 'int64'])

# Calculate skewness for numeric columns only
skewness = numeric_columns.skew()

# Define a function to impute based on skewness
def impute_based_on_skewness(df, skewness):
    for column in df.columns:
        if df[column].isnull().sum() > 0:
            if abs(skewness[column]) > 0.5:
                df[column] = df[column].fillna(df[column].median())
            else:  #
                df[column] = df[column].fillna(df[column].mean())

# Apply the function to impute missing values for the numeric data
impute_based_on_skewness(numeric_columns, skewness)

# Replace the numeric columns in the original dataframe with the imputed ones
data[numeric_columns.columns] = numeric_columns

# Check the result to see if missing values are imputed
print(data.isnull().sum())

Season       0
Name         0
Team         0
G            0
PA           0
            ..
XBR          0
Age          0
NameASCII    0
PlayerId     0
MLBAMID      0
Length: 264, dtype: int64


# Display only the columns with missing values
missing_columns = data.columns[data.isnull().any()]  # Get columns with missing values
missing_data = data[missing_columns].isnull().sum()  # Count missing values in those columns

# Display the columns with their missing value count
print(missing_data)

Series([], dtype: float64)


# Check the data types of the problematic columns
print(data[['Age Rng', 'Bats']].dtypes)
# Display the first few rows of the problematic columns
print(data[['Age Rng', 'Bats']].head(10))

Age Rng    object
Bats       object
dtype: object
   Age Rng Bats
0  23 - 23    L
1  30 - 30    L
2  22 - 22    L
3  24 - 24    R
4  21 - 21    B
5  23 - 23    R
6  27 - 27    L
7  30 - 30    B
8  25 - 25    R
9  27 - 27    B


# Split the Age Rng into Min Age and Max Age
data[['Min Age', 'Max Age']] = data['Age Rng'].str.split(' - ', expand=True)

# Convert the new columns to numeric
data['Min Age'] = pd.to_numeric(data['Min Age'], errors='coerce')
data['Max Age'] = pd.to_numeric(data['Max Age'], errors='coerce')


# Check for missing values in the dataset
missing_values = data.isnull().sum()

# Filter and display only columns with missing values
columns_with_missing_values = missing_values[missing_values > 0]

# Display the columns with missing values
print(columns_with_missing_values)

Series([], dtype: int64)


# Calculate skewness for Min Age and Max Age
min_age_skewness = data['Min Age'].skew()
max_age_skewness = data['Max Age'].skew()

print(f"Skewness of 'Min Age': {min_age_skewness:.2f}")
print(f"Skewness of 'Max Age': {max_age_skewness:.2f}")

# Plot histograms to visualize the distribution
plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
data['Min Age'].hist(bins=20)
plt.title('Min Age Distribution')

plt.subplot(1, 2, 2)
data['Max Age'].hist(bins=20)
plt.title('Max Age Distribution')

plt.tight_layout()
plt.show()

Skewness of 'Min Age': 0.46
Skewness of 'Max Age': 0.46


# Impute missing values in Min Age and Max Age using the mean
data['Min Age'].fillna(data['Min Age'].mean(), inplace=True)
data['Max Age'].fillna(data['Max Age'].mean(), inplace=True)

# Verify that there are no more missing values
print(data[['Min Age', 'Max Age']].isnull().sum())

Min Age    0
Max Age    0
dtype: int64


# Check the range of values in numeric columns
range_check = data.select_dtypes(include=['float64', 'int64']).agg(['min', 'max'])

# Display range check
print(range_check)

     Season    G   PA  HR    R  RBI  SB       BB%        K%       ISO  ...  \
min    2015   44  186   0   13   10   0  0.015306  0.043155  0.050710  ...   
max    2024  162  753  62  149  144  73  0.221713  0.439024  0.379249  ...   

     wOppTeamV    wTeamV  wNetBatV   TG        XBR  Age  PlayerId  MLBAMID  \
min   0.000000 -0.523237 -0.387861   58 -10.284589   20       393   116338   
max   0.748088  0.000000  0.681861  163   8.625678   41     33333   807799   

     Min Age  Max Age  
min       20       20  
max       41       41  

[2 rows x 261 columns]


# Step 1: Retain 'PlayerId', 'MLBAMID', 'Season', and 'Name' for later analysis (exclude them from PCA)
columns_to_exclude = ['PlayerId', 'MLBAMID', 'Season', 'Name']
features_for_pca = data.drop(columns=columns_to_exclude)

# Step 2: Select only numeric columns for PCA
numeric_features = features_for_pca.select_dtypes(include=['float64', 'int64'])

# Step 3: Standardize the features before applying PCA
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_features)

# Step 4: Apply PCA (retain 95% of variance)
pca = PCA(n_components=0.95)
pca_features = pca.fit_transform(scaled_features)

# Display how many components were retained after PCA
print(f"PCA reduced the dataset to {pca_features.shape[1]} components.")

# Step 5: Visualize the explained variance by each principal component
explained_variance = pca.explained_variance_ratio_

plt.figure(figsize=(8, 5))
plt.plot(range(1, len(explained_variance) + 1), explained_variance.cumsum(), marker='o', linestyle='--')
plt.title('Explained Variance by Principal Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

PCA reduced the dataset to 72 components.


# Step 1: Exclude non-performance columns AND 'Age', 'Min Age', 'Max Age' for clustering
columns_to_exclude = ['PlayerId', 'MLBAMID', 'Season', 'Name', 'Age', 'Min Age', 'Max Age', 'Pitches','Strikes','Balls', 'AB', 'PA']
features_for_kmeans = data.drop(columns=columns_to_exclude)

# Step 2: Initialize KMeans model (with 3 clusters, adjust based on your needs)
kmeans = KMeans(n_clusters=3, random_state=42)

# Step 3: Fit the KMeans model on PCA-transformed features
kmeans.fit(pca_features)

# Step 4: Get cluster labels for each player (these labels will be used as the target in SVM)
cluster_labels = kmeans.labels_

# Step 5: Retain 'PlayerId', 'MLBAMID', 'Season', and 'Name' for post-analysis (IDs and seasons)
ids_and_season = data[['PlayerId', 'MLBAMID', 'Season', 'Name']]

# Step 6: Combine the IDs, seasons, and cluster labels to review
final_result = pd.concat([ids_and_season.reset_index(drop=True), pd.DataFrame(cluster_labels, columns=['Cluster'])], axis=1)

# Step 7: View the final result to see which players belong to each cluster
print(final_result.head())

# Get the unique clusters and their counts
cluster_counts = pd.Series(cluster_labels).value_counts()

# Display the number of players in each cluster
print(cluster_counts)

# Display the total number of unique clusters
print(f"Total number of clusters: {len(cluster_counts)}")

   PlayerId  MLBAMID  Season                Name  Cluster
0     25878   682998    2024      Corbin Carroll        2
1      8203   543829    2018  Dee Strange-Gordon        0
2     26289   683002    2023    Gunnar Henderson        2
3     12161   593428    2017     Xander Bogaerts        0
4     16556   645277    2018        Ozzie Albies        0
1    491
0    444
2    440
Name: count, dtype: int64
Total number of clusters: 3


# Step 1: Calculate inertia (within-cluster sum of squares) for different cluster sizes
inertia_values = []
cluster_range = range(2, 11)  # Test cluster sizes from 2 to 10

for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(pca_features)
    inertia_values.append(kmeans.inertia_)

# Step 2: Plot the inertia values to find the "elbow"
plt.figure(figsize=(8, 5))
plt.plot(cluster_range, inertia_values, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia (Within-Cluster Sum of Squares)')
plt.grid(True)
plt.show()


# Step 1: Initialize KMeans with 4 clusters
optimal_n_clusters = 4
kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=42)

# Step 2: Fit the KMeans model on PCA-transformed features (excluding age-related columns)
kmeans.fit(pca_features)

# Step 3: Get the new cluster labels
cluster_labels = kmeans.labels_

# Step 4: Retain 'PlayerId', 'MLBAMID', 'Season', and 'Name' for post-analysis (IDs and seasons)
ids_and_season = data[['PlayerId', 'MLBAMID', 'Season', 'Name']]

# Step 5: Combine the IDs, seasons, and cluster labels
final_result = pd.concat([ids_and_season.reset_index(drop=True), pd.DataFrame(cluster_labels, columns=['Cluster'])], axis=1)

# Step 6: View the final result
print(final_result.head())

# Get the unique clusters and their counts
cluster_counts = pd.Series(cluster_labels).value_counts()

# Display the number of players in each cluster
print(cluster_counts)

# Display the total number of unique clusters
print(f"Total number of clusters: {len(cluster_counts)}")

   PlayerId  MLBAMID  Season                Name  Cluster
0     25878   682998    2024      Corbin Carroll        3
1      8203   543829    2018  Dee Strange-Gordon        0
2     26289   683002    2023    Gunnar Henderson        3
3     12161   593428    2017     Xander Bogaerts        0
4     16556   645277    2018        Ozzie Albies        0
0    470
3    400
2    364
1    141
Name: count, dtype: int64
Total number of clusters: 4


# Step 1: Exclude non-performance columns (PlayerId, MLBAMID, Season, Name) for SVM
features_for_svm = data.drop(columns=columns_to_exclude)

# Step 2: Select only numeric columns for SVM
numeric_features = features_for_svm.select_dtypes(include=['float64', 'int64'])

# Step 3: Split the PCA-transformed numeric data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(pca_features, cluster_labels, test_size=0.3, random_state=42)

# Step 4: Initialize and train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Step 5: Evaluate the SVM model
print(f"Training Accuracy: {svm_model.score(X_train, y_train)}")
print(f"Test Accuracy: {svm_model.score(X_test, y_test)}")

# Step 6: Make predictions on the test set
predicted_labels = svm_model.predict(X_test)

# Step 7: Combine the IDs, seasons, and predicted labels for final analysis
final_result_svm = pd.concat([ids_and_season.reset_index(drop=True), pd.DataFrame(predicted_labels, columns=['Predicted Cluster'])], axis=1)

# Step 8: View the final result (showing predicted clusters)
print(final_result_svm.head())

Training Accuracy: 1.0
Test Accuracy: 0.9564164648910412
   PlayerId  MLBAMID  Season                Name  Predicted Cluster
0     25878   682998    2024      Corbin Carroll                3.0
1      8203   543829    2018  Dee Strange-Gordon                0.0
2     26289   683002    2023    Gunnar Henderson                2.0
3     12161   593428    2017     Xander Bogaerts                0.0
4     16556   645277    2018        Ozzie Albies                0.0


# Step 1: Exclude non-performance columns (PlayerId, MLBAMID, Season, Name) for SVM
features_for_svm = data.drop(columns=columns_to_exclude)

# Step 2: Select only numeric columns for SVM
numeric_features = features_for_svm.select_dtypes(include=['float64', 'int64'])

# Step 3: Perform cross-validation using the PCA-transformed features and cluster labels as target
svm_model = SVC(kernel='linear')

# Perform 5-fold cross-validation (using PCA-transformed features and cluster labels)
cv_scores = cross_val_score(svm_model, pca_features, cluster_labels, cv=5)

# Step 4: Display the cross-validation scores and the mean score (rounded to 3 decimal places)
cv_scores_rounded = [round(score, 3) for score in cv_scores]
mean_cv_score_rounded = round(cv_scores.mean(), 3)

# Step 5: Print the rounded scores
print(f"Cross-validation scores for each fold: {cv_scores_rounded}")
print(f"Mean cross-validation accuracy: {mean_cv_score_rounded}")

Cross-validation scores for each fold: [0.949, 0.964, 0.935, 0.96, 0.945]
Mean cross-validation accuracy: 0.951


# Impute NaNs with the median value of each column
numeric_columns = numeric_columns.fillna(numeric_columns.median())

# Check again for any remaining NaNs
print(numeric_columns.isnull().sum().sum())

0


# Apply t-SNE for 2D visualization of clusters
tsne = TSNE(n_components=2, random_state=42)
tsne_features = tsne.fit_transform(numeric_columns)

# Plot the t-SNE results
plt.figure(figsize=(8, 6))
plt.scatter(tsne_features[:, 0], tsne_features[:, 1], c=cluster_labels, cmap='viridis', s=50)
plt.title('t-SNE Visualization of Clusters')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.colorbar(label='Cluster')
plt.grid(True)

# Save the figure as a PNG file before displaying
plt.savefig("tsne_plot.png", format='png')

# Display the plot in the notebook
plt.show()


# Step 1: Add the predicted cluster labels from the SVM model to the original dataset
final_result_svm = pd.concat([ids_and_season.reset_index(drop=True), pd.DataFrame(predicted_labels, columns=['Predicted Cluster'])], axis=1)

# Step 2: Combine the predicted clusters with the original data
data_with_predicted_clusters = pd.concat([data, pd.DataFrame(predicted_labels, columns=['Predicted Cluster'])], axis=1)

# Step 3: Exclude non-performance columns like 'PlayerId', 'MLBAMID', 'Season', and 'Name'
performance_data = data_with_predicted_clusters.drop(columns=columns_to_exclude)

# Step 4: Select only numeric performance-related columns for analysis
numeric_columns = performance_data.select_dtypes(include=['float64', 'int64'])

# Step 5: Calculate the mean values of the numeric performance features for each predicted cluster
cluster_means_svm = numeric_columns.groupby('Predicted Cluster').mean()

# Step 6: Display the mean values of features for each predicted cluster
print(cluster_means_svm)

                            G         HR          R        RBI         SB  \
Predicted Cluster                                                           
0.0                142.500000  20.173913  82.557971  70.833333  16.318841   
1.0                143.218182  18.018182  78.654545  66.563636  15.145455   
2.0                142.982456  20.359649  82.026316  71.464912  16.052632   
3.0                137.735849  18.745283  77.754717  65.915094  13.716981   

                        BB%        K%       ISO     BABIP       AVG  ...  \
Predicted Cluster                                                    ...   
0.0                0.085231  0.191754  0.184516  0.312136  0.273484  ...   
1.0                0.079007  0.191835  0.170632  0.303551  0.264341  ...   
2.0                0.084487  0.204223  0.181110  0.310047  0.267394  ...   
3.0                0.083558  0.201764  0.178396  0.307451  0.265367  ...   

                       Events     CStr%      CSW%     L-WAR       DGV  \
Predicted Cluster                                                       
0.0                421.768116  0.163732  0.261157  3.694578  0.057971   
1.0                427.654545  0.163004  0.261697  3.128038  0.000000   
2.0                423.807018  0.160050  0.264392  3.428330  0.017544   
3.0                402.084906  0.165295  0.266392  3.173385  0.047170   

                   wOppTeamV    wTeamV  wNetBatV          TG       XBR  
Predicted Cluster                                                       
0.0                 0.039185 -0.006331  0.032854  156.811594  2.430008  
1.0                 0.009599 -0.004448  0.005151  158.218182  2.140175  
2.0                 0.044106 -0.005176  0.038931  158.324561  2.398534  
3.0                 0.035820 -0.003017  0.032803  152.339623  2.276904  

[4 rows x 250 columns]


# Calculate the mean for each cluster across all numeric features
cluster_means = numeric_columns.groupby(cluster_labels).mean()

# Calculate the variance for each cluster across all numeric features
cluster_variance = numeric_columns.groupby(cluster_labels).var()

# Display the top metrics with the highest variance across clusters
top_variance_metrics = cluster_variance.mean().sort_values(ascending=False).head(10)
print("Top 10 metrics with highest variance across clusters:")
print(top_variance_metrics)

# Display the means for top variance metrics
print("Cluster means for top variance metrics:")
print(cluster_means[top_variance_metrics.index])

Top 10 metrics with highest variance across clusters:
Events     2594.675240
HR/FB%+    1725.110894
BB%+       1203.723488
GB         1039.402947
FB          817.909615
HardHit     795.804251
ISO+        768.148936
SO          761.338795
K%+         599.628192
Soft%+      387.422831
dtype: float64
Cluster means for top variance metrics:
       Events     HR/FB%+        BB%+          GB          FB     HardHit  \
0  440.780851   80.703824   89.055797  197.259574  145.644681  145.542553   
1  151.418440  112.259210  104.759454   64.021277   54.134752   59.574468   
2  430.741758  145.403572  129.478573  171.250000  166.381868  186.648352   
3  402.565000  111.274168  101.098256  167.200000  153.812500  165.832500   

         ISO+          SO        K%+      Soft%+  
0   88.771715  103.010638  82.691111  100.873666  
1  111.819973   48.659574  92.867043   95.261669  
2  141.396042  128.060440  93.531665   86.066942  
3  110.258241  131.455000  99.133567   96.413571


# Visualize the means of top variance features using a bar chart
high_variance_features = top_variance_metrics.index  # Features with high variance

# Calculate means for these features by cluster
cluster_means_top_variance = cluster_means[high_variance_features]

# Plot the top variance features by cluster
cluster_means_top_variance.T.plot(kind='bar', figsize=(12, 8))
plt.title("Top-Variance Performance Metrics by Predicted Cluster")
plt.ylabel("Mean Value")
plt.xlabel("Performance Metrics")
plt.xticks(rotation=45)
plt.legend(title="Cluster")
plt.grid(True)
plt.tight_layout()
plt.show()


# Impute NaNs with the median value of each column
numeric_columns = numeric_columns.fillna(numeric_columns.median())

# Check again for any remaining NaNs
print(numeric_columns.isnull().sum().sum())

0


# Compare key metrics for Cluster 0 and Cluster 3
key_metrics = ['HR', 'ISO', 'SB', 'K%', 'BB%']

# cluster labels
cluster_labels_series = pd.Series(cluster_labels)

# Filter the data for Cluster 0 and Cluster 3
cluster_0_3_data = numeric_columns[cluster_labels_series.isin([0, 3])]

# Compare mean values of key metrics for both clusters
cluster_0_3_means = cluster_0_3_data.groupby(cluster_labels_series).mean()[key_metrics]

# Display the results
print(cluster_0_3_means)

          HR       ISO         SB        K%       BB%
0  15.148936  0.147338  10.804255  0.174129  0.074476
3  21.242500  0.179519   9.027500  0.223099  0.085378


# Assigning descriptive names to the clusters
cluster_names = {
    0: "Speedy Contact Hitters",
    1: "Defensive Specialists",
    2: "Power-Speed Hybrids",
    3: "Power Hitters"
}


# Function to look up a player and return their cluster information
def get_player_cluster(player_name):
    # Search for the player in the DataFrame
    player_info = final_result[final_result['Name'].str.contains(player_name, case=False, na=False)]

    # If player is found
    if not player_info.empty:
        cluster_label = player_info.iloc[0]['Cluster']  # Get the cluster label
        cluster_name = cluster_names[cluster_label]  # Get the cluster name from the dictionary
        return f"{player_name} belongs to '{cluster_name}' (Cluster {cluster_label})"
    else:
        return f"Player {player_name} not found in the dataset."

# Example usage:
print(get_player_cluster("Randy Arozarena"))

Randy Arozarena belongs to 'Power Hitters' (Cluster 3)


print(get_player_cluster("Francisco Lindor"))

Francisco Lindor belongs to 'Speedy Contact Hitters' (Cluster 0)


!cp "/content/drive/MyDrive/Colab Notebooks/silverstein_svm.ipynb" ./
!jupyter nbconvert --to html "silverstein_svm.ipynb"

[NbConvertApp] Converting notebook silverstein_svm.ipynb to html
[NbConvertApp] Writing 1074074 bytes to silverstein_svm.html

Clustering MLB Players into Performance Archetypes Using SVM and Feature Cutoffs¶

Packages¶

EDA¶

Missing Values¶

Data Information¶

Modeling¶

K-Means Clustering¶

SVM¶

Cluster Characteristics¶

Cluster names¶

	ï»¿Season	Name	Team	G	PA	HR	R	RBI	SB	BB%	...	TG.4	Bats.4	FPTS.4	FPTS/G.4	SPTS.4	SPTS/G.4	XBR.3	NameASCII	PlayerId	MLBAMID
0	2024	Corbin Carroll	ARI	158	684	22	121	74	35	0.106725	...	162	L	NaN	NaN	NaN	NaN	8.625678	Corbin Carroll	25878	682998
1	2018	Dee Strange-Gordon	SEA	141	588	4	62	36	30	0.015306	...	162	L	NaN	NaN	NaN	NaN	6.418709	Dee Strange-Gordon	8203	543829
2	2023	Gunnar Henderson	BAL	150	622	28	100	82	10	0.090032	...	162	L	NaN	NaN	NaN	NaN	6.097221	Gunnar Henderson	26289	683002
3	2017	Xander Bogaerts	BOS	148	635	10	94	62	15	0.088189	...	162	R	NaN	NaN	NaN	NaN	6.003955	Xander Bogaerts	12161	593428
4	2018	Ozzie Albies	ATL	158	684	24	105	72	14	0.052632	...	162	B	NaN	NaN	NaN	NaN	5.864576	Ozzie Albies	16556	645277

	0
Season	0
Name	0
Team	0
G	0
PA	0
...	...
XBR	142
Age	0
NameASCII	0
PlayerId	0
MLBAMID	0