%matplotlib inline

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, set_link_color_palette
from sklearn.metrics import silhouette_score

# Load the dataset
file_path = '/content/pitch_type.csv'
data = pd.read_csv(file_path)

# Display initial structure for context
print("Initial Dataset Shape:", data.shape)
print(data.info())
print(data.head())

Initial Dataset Shape: (1106, 39)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1106 entries, 0 to 1105
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   last_name, first_name   1106 non-null   object 
 1   player_id               1106 non-null   int64  
 2   year                    1106 non-null   int64  
 3   n_ff_formatted          1069 non-null   float64
 4   ff_avg_speed            1069 non-null   float64
 5   ff_avg_spin             1069 non-null   float64
 6   ff_avg_break_z_induced  1069 non-null   float64
 7   n_sl_formatted          818 non-null    float64
 8   sl_avg_speed            818 non-null    float64
 9   sl_avg_spin             814 non-null    float64
 10  sl_avg_break            818 non-null    float64
 11  n_ch_formatted          988 non-null    float64
 12  ch_avg_speed            988 non-null    float64
 13  ch_avg_spin             988 non-null    float64
 14  ch_avg_break            988 non-null    float64
 15  n_cu_formatted          954 non-null    float64
 16  cu_avg_speed            954 non-null    float64
 17  cu_avg_spin             954 non-null    float64
 18  cu_avg_break            954 non-null    float64
 19  n_si_formatted          918 non-null    float64
 20  si_avg_speed            918 non-null    float64
 21  si_avg_spin             918 non-null    float64
 22  si_avg_break            918 non-null    float64
 23  n_fc_formatted          517 non-null    float64
 24  fc_avg_speed            517 non-null    float64
 25  fc_avg_spin             517 non-null    float64
 26  fc_avg_break            517 non-null    float64
 27  n_fs_formatted          140 non-null    float64
 28  fs_avg_speed            140 non-null    float64
 29  fs_avg_spin             140 non-null    float64
 30  fs_avg_break            140 non-null    float64
 31  n_st_formatted          143 non-null    float64
 32  st_avg_speed            143 non-null    float64
 33  st_avg_spin             143 non-null    float64
 34  st_avg_break            143 non-null    float64
 35  n_sv_formatted          28 non-null     float64
 36  sv_avg_speed            28 non-null     float64
 37  sv_avg_spin             28 non-null     float64
 38  sv_avg_break            28 non-null     float64
dtypes: float64(36), int64(2), object(1)
memory usage: 337.1+ KB
None
  last_name, first_name  player_id  year  n_ff_formatted  ff_avg_speed  \
0        Colon, Bartolo     112526  2015            29.1          90.9   
1         Burnett, A.J.     150359  2015            11.7          91.7   
2           Hudson, Tim     218596  2015             7.0          88.5   
3         Buehrle, Mark     279824  2015            26.4          84.5   
4          Sabathia, CC     282332  2015            25.2          90.8   

   ff_avg_spin  ff_avg_break_z_induced  n_sl_formatted  sl_avg_speed  \
0       2255.0                    15.5             9.7          82.8   
1       2082.0                    12.0             NaN           NaN   
2       2126.0                    11.8             NaN           NaN   
3       2076.0                    12.1             NaN           NaN   
4       2114.0                    14.7            22.5          79.6   

   sl_avg_spin  ...  fs_avg_spin  fs_avg_break  n_st_formatted  st_avg_speed  \
0       2178.0  ...          NaN           NaN             NaN           NaN   
1          NaN  ...          NaN           NaN             NaN           NaN   
2          NaN  ...       1369.0          10.3             NaN           NaN   
3          NaN  ...          NaN           NaN             NaN           NaN   
4       1823.0  ...          NaN           NaN             NaN           NaN   

   st_avg_spin  st_avg_break  n_sv_formatted  sv_avg_speed  sv_avg_spin  \
0          NaN           NaN             NaN           NaN          NaN   
1          NaN           NaN             NaN           NaN          NaN   
2          NaN           NaN             NaN           NaN          NaN   
3          NaN           NaN             NaN           NaN          NaN   
4          NaN           NaN             NaN           NaN          NaN   

   sv_avg_break  
0           NaN  
1           NaN  
2           NaN  
3           NaN  
4           NaN  

[5 rows x 39 columns]

# Identify columns related to pitch usage
pitch_columns = [col for col in data.columns if 'n_' in col]

# Create binary flags for each pitch type
for col in pitch_columns:
    flag_col = col.replace('n_', 'has_')
    data[flag_col] = data[col].notna().astype(int)

# Display new columns with binary flags
print("Sample Data with Binary Flags:")
print(data[[col for col in data.columns if 'has_' in col]].head())

Sample Data with Binary Flags:
   has_ff_formatted  has_sl_formatted  has_ch_formatted  has_cu_formatted  \
0                 1                 1                 1                 1   
1                 1                 0                 1                 1   
2                 1                 0                 0                 1   
3                 1                 0                 1                 1   
4                 1                 1                 1                 0   

   has_si_formatted  has_fc_formatted  has_fs_formatted  has_st_formatted  \
0                 1                 0                 0                 0   
1                 1                 0                 0                 0   
2                 1                 1                 1                 0   
3                 1                 1                 0                 0   
4                 1                 1                 0                 0   

   has_sv_formatted  
0                 0  
1                 0  
2                 0  
3                 0  
4                 0

# Identify numeric columns for imputation
numeric_columns = [col for col in data.columns if data[col].dtype == 'float64']

# Replace missing values in numeric columns with 0
data[numeric_columns] = data[numeric_columns].fillna(0)

# Verify imputation
print("Sample Data after Missing Value Imputation:")
print(data[numeric_columns].head())

Sample Data after Missing Value Imputation:
   n_ff_formatted  ff_avg_speed  ff_avg_spin  ff_avg_break_z_induced  \
0            29.1          90.9       2255.0                    15.5   
1            11.7          91.7       2082.0                    12.0   
2             7.0          88.5       2126.0                    11.8   
3            26.4          84.5       2076.0                    12.1   
4            25.2          90.8       2114.0                    14.7   

   n_sl_formatted  sl_avg_speed  sl_avg_spin  sl_avg_break  n_ch_formatted  \
0             9.7          82.8       2178.0           6.3             7.4   
1             0.0           0.0          0.0           0.0             8.8   
2             0.0           0.0          0.0           0.0             0.0   
3             0.0           0.0          0.0           0.0            21.1   
4            22.5          79.6       1823.0          11.8            14.0   

   ch_avg_speed  ...  fs_avg_spin  fs_avg_break  n_st_formatted  st_avg_speed  \
0          82.6  ...          0.0           0.0             0.0           0.0   
1          86.3  ...          0.0           0.0             0.0           0.0   
2           0.0  ...       1369.0          10.3             0.0           0.0   
3          78.7  ...          0.0           0.0             0.0           0.0   
4          83.9  ...          0.0           0.0             0.0           0.0   

   st_avg_spin  st_avg_break  n_sv_formatted  sv_avg_speed  sv_avg_spin  \
0          0.0           0.0             0.0           0.0          0.0   
1          0.0           0.0             0.0           0.0          0.0   
2          0.0           0.0             0.0           0.0          0.0   
3          0.0           0.0             0.0           0.0          0.0   
4          0.0           0.0             0.0           0.0          0.0   

   sv_avg_break  
0           0.0  
1           0.0  
2           0.0  
3           0.0  
4           0.0  

[5 rows x 36 columns]

# Select columns for standardization (numeric and binary flags)
scaled_columns = numeric_columns + [col for col in data.columns if 'has_' in col]

# Standardize the selected columns
scaler = StandardScaler()
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

# Verify standardization
print("Sample Data after Standardization:")
print(data[scaled_columns].head())

Sample Data after Standardization:
   n_ff_formatted  ff_avg_speed  ff_avg_spin  ff_avg_break_z_induced  \
0       -0.211812      0.067336     0.193389                0.107785   
1       -1.223574      0.114787    -0.210085               -0.838111   
2       -1.496867     -0.075016    -0.107468               -0.892162   
3       -0.368809     -0.312270    -0.224079               -0.811085   
4       -0.438586      0.061405    -0.135454               -0.108420   

   n_sl_formatted  sl_avg_speed  sl_avg_spin  sl_avg_break  n_ch_formatted  \
0       -0.318350      0.540557     0.435467      0.165177       -0.509354   
1       -1.102058     -1.681349    -1.640706     -1.340870       -0.354382   
2       -1.102058     -1.681349    -1.640706     -1.340870       -1.328491   
3       -1.102058     -1.681349    -1.640706     -1.340870        1.007157   
4        0.715822      0.454687     0.097064      1.479980        0.221228   

   ch_avg_speed  ...  sv_avg_break  has_ff_formatted  has_sl_formatted  \
0      0.252418  ...     -0.156807          0.186042          0.593362   
1      0.392578  ...     -0.156807          0.186042         -1.685312   
2     -2.876543  ...     -0.156807          0.186042         -1.685312   
3      0.104683  ...     -0.156807          0.186042         -1.685312   
4      0.301663  ...     -0.156807          0.186042          0.593362   

   has_ch_formatted  has_cu_formatted  has_si_formatted  has_fc_formatted  \
0          0.345591          0.399161          0.452541         -0.936888   
1          0.345591          0.399161          0.452541         -0.936888   
2         -2.893593          0.399161          0.452541          1.067364   
3          0.345591          0.399161          0.452541          1.067364   
4          0.345591         -2.505258          0.452541          1.067364   

   has_fs_formatted  has_st_formatted  has_sv_formatted  
0         -0.380693          -0.38535         -0.161165  
1         -0.380693          -0.38535         -0.161165  
2          2.626785          -0.38535         -0.161165  
3         -0.380693          -0.38535         -0.161165  
4         -0.380693          -0.38535         -0.161165  

[5 rows x 45 columns]

print("Final Cleaned Dataset Shape:", data.shape)
print(data.head())

Final Cleaned Dataset Shape: (1106, 48)
  last_name, first_name  player_id  year  n_ff_formatted  ff_avg_speed  \
0        Colon, Bartolo     112526  2015       -0.211812      0.067336   
1         Burnett, A.J.     150359  2015       -1.223574      0.114787   
2           Hudson, Tim     218596  2015       -1.496867     -0.075016   
3         Buehrle, Mark     279824  2015       -0.368809     -0.312270   
4          Sabathia, CC     282332  2015       -0.438586      0.061405   

   ff_avg_spin  ff_avg_break_z_induced  n_sl_formatted  sl_avg_speed  \
0     0.193389                0.107785       -0.318350      0.540557   
1    -0.210085               -0.838111       -1.102058     -1.681349   
2    -0.107468               -0.892162       -1.102058     -1.681349   
3    -0.224079               -0.811085       -1.102058     -1.681349   
4    -0.135454               -0.108420        0.715822      0.454687   

   sl_avg_spin  ...  sv_avg_break  has_ff_formatted  has_sl_formatted  \
0     0.435467  ...     -0.156807          0.186042          0.593362   
1    -1.640706  ...     -0.156807          0.186042         -1.685312   
2    -1.640706  ...     -0.156807          0.186042         -1.685312   
3    -1.640706  ...     -0.156807          0.186042         -1.685312   
4     0.097064  ...     -0.156807          0.186042          0.593362   

   has_ch_formatted  has_cu_formatted  has_si_formatted  has_fc_formatted  \
0          0.345591          0.399161          0.452541         -0.936888   
1          0.345591          0.399161          0.452541         -0.936888   
2         -2.893593          0.399161          0.452541          1.067364   
3          0.345591          0.399161          0.452541          1.067364   
4          0.345591         -2.505258          0.452541          1.067364   

   has_fs_formatted  has_st_formatted  has_sv_formatted  
0         -0.380693          -0.38535         -0.161165  
1         -0.380693          -0.38535         -0.161165  
2          2.626785          -0.38535         -0.161165  
3         -0.380693          -0.38535         -0.161165  
4         -0.380693          -0.38535         -0.161165  

[5 rows x 48 columns]

# Select only numeric columns (including binary flags)
pca_columns = [col for col in data.columns if col.startswith(('ff_', 'sl_', 'cu_', 'si_', 'ch_', 'has_'))]

# Apply PCA
pca = PCA()
pca_result = pca.fit_transform(data[pca_columns])

# Explained variance
explained_variance = np.cumsum(pca.explained_variance_ratio_)

# Visualize the explained variance
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
plt.title('Explained Variance by Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid()
plt.show()

print("Feature contributions to PC2:")
print(components.loc['PC2'].sort_values(ascending=False).head())

Feature contributions to PC2:
ff_avg_spin               0.304537
ff_avg_speed              0.292026
ff_avg_break_z_induced    0.291926
has_ff_formatted          0.287610
ch_avg_break              0.241365
Name: PC2, dtype: float64

# Display contributions for all PCs
# Define the number of components based on explained variance \
num_components = len(pca.components_)

for i in range(num_components):
    print(f"Feature contributions to PC{i+1}:")
    print(components.loc[f'PC{i+1}'].sort_values(ascending=False).head())

Feature contributions to PC1:
has_ch_formatted    0.373448
ch_avg_speed        0.372151
ch_avg_spin         0.358590
ch_avg_break        0.346982
has_si_formatted    0.253067
Name: PC1, dtype: float64
Feature contributions to PC2:
ff_avg_spin               0.304537
ff_avg_speed              0.292026
ff_avg_break_z_induced    0.291926
has_ff_formatted          0.287610
ch_avg_break              0.241365
Name: PC2, dtype: float64
Feature contributions to PC3:
has_cu_formatted    0.351846
cu_avg_spin         0.349911
cu_avg_speed        0.348376
cu_avg_break        0.322361
has_fc_formatted    0.164561
Name: PC3, dtype: float64
Feature contributions to PC4:
si_avg_spin         0.331663
si_avg_break        0.328058
si_avg_speed        0.326781
has_si_formatted    0.324357
sl_avg_speed        0.252549
Name: PC4, dtype: float64
Feature contributions to PC5:
has_ff_formatted          0.359402
ff_avg_speed              0.354541
ff_avg_spin               0.329061
ff_avg_break_z_induced    0.245096
si_avg_break              0.137768
Name: PC5, dtype: float64
Feature contributions to PC6:
has_st_formatted    0.714181
has_fc_formatted    0.565005
sl_avg_break        0.093005
cu_avg_break        0.071189
ch_avg_spin         0.065638
Name: PC6, dtype: float64
Feature contributions to PC7:
has_sv_formatted    0.898582
has_st_formatted    0.391422
cu_avg_break        0.127061
sl_avg_break        0.068204
sl_avg_spin         0.061492
Name: PC7, dtype: float64
Feature contributions to PC8:
has_fc_formatted    0.760846
has_sv_formatted    0.201797
sl_avg_break        0.195804
has_fs_formatted    0.118418
ch_avg_spin         0.054058
Name: PC8, dtype: float64
Feature contributions to PC9:
has_fs_formatted          0.815169
ch_avg_break              0.306076
ff_avg_break_z_induced    0.275782
ch_avg_spin               0.193832
cu_avg_break              0.128117
Name: PC9, dtype: float64
Feature contributions to PC10:
cu_avg_break        0.657918
sl_avg_break        0.344856
has_ff_formatted    0.128736
ff_avg_speed        0.097633
ff_avg_spin         0.088544
Name: PC10, dtype: float64
Feature contributions to PC11:
ff_avg_break_z_induced    0.627330
sl_avg_break              0.408757
cu_avg_break              0.184616
si_avg_break              0.174545
ch_avg_break              0.065785
Name: PC11, dtype: float64
Feature contributions to PC12:
sl_avg_break        0.671894
cu_avg_speed        0.220477
has_cu_formatted    0.173505
has_fs_formatted    0.108697
has_ff_formatted    0.087315
Name: PC12, dtype: float64
Feature contributions to PC13:
ch_avg_break    0.501528
ch_avg_spin     0.385547
ff_avg_spin     0.236594
cu_avg_spin     0.107025
sl_avg_spin     0.092937
Name: PC13, dtype: float64
Feature contributions to PC14:
ff_avg_spin    0.477263
ch_avg_spin    0.294721
si_avg_spin    0.290919
cu_avg_spin    0.232700
sl_avg_spin    0.182240
Name: PC14, dtype: float64
Feature contributions to PC15:
ch_avg_spin               0.660581
ff_avg_break_z_induced    0.148038
sl_avg_speed              0.139365
has_ff_formatted          0.136807
has_cu_formatted          0.128818
Name: PC15, dtype: float64
Feature contributions to PC16:
cu_avg_spin         0.594090
sl_avg_spin         0.340587
si_avg_break        0.199052
has_ff_formatted    0.188391
ff_avg_speed        0.172436
Name: PC16, dtype: float64
Feature contributions to PC17:
si_avg_break        0.709988
ff_avg_spin         0.202035
ch_avg_spin         0.186809
has_sl_formatted    0.133405
sl_avg_speed        0.131780
Name: PC17, dtype: float64
Feature contributions to PC18:
sl_avg_spin         0.624594
has_cu_formatted    0.254985
si_avg_break        0.231020
cu_avg_speed        0.149822
cu_avg_break        0.088291
Name: PC18, dtype: float64
Feature contributions to PC19:
ff_avg_speed    0.490108
cu_avg_speed    0.340608
ch_avg_speed    0.303088
si_avg_speed    0.211675
sl_avg_speed    0.097424
Name: PC19, dtype: float64
Feature contributions to PC20:
si_avg_spin         0.764599
has_ff_formatted    0.190109
ff_avg_speed        0.098559
ch_avg_speed        0.059756
has_cu_formatted    0.046594
Name: PC20, dtype: float64
Feature contributions to PC21:
cu_avg_speed        0.608264
has_ff_formatted    0.338904
has_ch_formatted    0.094710
has_si_formatted    0.075140
sl_avg_speed        0.048278
Name: PC21, dtype: float64
Feature contributions to PC22:
ch_avg_speed        0.622794
has_ff_formatted    0.296005
has_si_formatted    0.094704
has_cu_formatted    0.089103
si_avg_break        0.035498
Name: PC22, dtype: float64
Feature contributions to PC23:
has_sl_formatted    0.698334
ff_avg_speed        0.124128
cu_avg_speed        0.110215
ch_avg_speed        0.043043
si_avg_spin         0.038070
Name: PC23, dtype: float64
Feature contributions to PC24:
si_avg_speed        0.690029
has_ff_formatted    0.203760
has_sl_formatted    0.071871
has_ch_formatted    0.043950
ff_avg_spin         0.024593
Name: PC24, dtype: float64

# Drop unnecessary columns
data_preprocessed = data.drop(columns=['player_id', 'year'], errors='ignore')

# Select only numeric columns
numeric_columns = data_preprocessed.select_dtypes(include=['float64', 'int64']).columns
data_numeric = data_preprocessed[numeric_columns]

# Scale the numeric data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

# Display the scaled data
print("Scaled Data Shape:", data_scaled.shape)
print("Sample of Scaled Data:")
print(data_scaled[:5])

Scaled Data Shape: (1106, 45)
Sample of Scaled Data:
[[-2.11811731e-01  6.73362924e-02  1.93388597e-01  1.07784706e-01
  -3.18350034e-01  5.40557305e-01  4.35466804e-01  1.65177097e-01
  -5.09353553e-01  2.52418250e-01  2.81334731e-01  3.62595619e-01
  -1.21385113e+00  5.21363108e-01 -3.97349215e-01  2.14584159e-01
   2.04698190e+00  3.21247601e-01  3.93041929e-01  3.31132291e-01
  -6.68806664e-01 -9.36237137e-01 -9.31095654e-01 -8.67091784e-01
  -3.11510237e-01 -3.80523474e-01 -3.72628380e-01 -3.71057661e-01
  -2.90245005e-01 -3.84982900e-01 -3.83559141e-01 -3.75373683e-01
  -1.38388765e-01 -1.61064310e-01 -1.60011322e-01 -1.56807243e-01
   1.86042433e-01  5.93361812e-01  3.45591086e-01  3.99160545e-01
   4.52540637e-01 -9.36887887e-01 -3.80693494e-01 -3.85349567e-01
  -1.61164593e-01]
 [-1.22357446e+00  1.14787034e-01 -2.10085260e-01 -8.38110999e-01
  -1.10205843e+00 -1.68134860e+00 -1.64070579e+00 -1.34086972e+00
  -3.54381644e-01  3.92577543e-01  1.97653402e-01 -4.29528335e-01
   1.91681848e+00  5.50887386e-01 -9.58971254e-02  1.50734259e-01
   1.84751802e+00  4.27731647e-01  2.49191715e-01 -3.49732940e-02
  -6.68806664e-01 -9.36237137e-01 -9.31095654e-01 -8.67091784e-01
  -3.11510237e-01 -3.80523474e-01 -3.72628380e-01 -3.71057661e-01
  -2.90245005e-01 -3.84982900e-01 -3.83559141e-01 -3.75373683e-01
  -1.38388765e-01 -1.61064310e-01 -1.60011322e-01 -1.56807243e-01
   1.86042433e-01 -1.68531237e+00  3.45591086e-01  3.99160545e-01
   4.52540637e-01 -9.36887887e-01 -3.80693494e-01 -3.85349567e-01
  -1.61164593e-01]
 [-1.49686669e+00 -7.50159333e-02 -1.07467631e-01 -8.92162183e-01
  -1.10205843e+00 -1.68134860e+00 -1.64070579e+00 -1.34086972e+00
  -1.32849079e+00 -2.87654328e+00 -2.66800516e+00 -2.58986639e+00
   1.83844620e-03  3.11002629e-01  1.73703423e-01  7.89233264e-01
   1.68325365e+00  3.47149126e-01  2.89421012e-01 -9.12972301e-02
   1.49261794e+00  9.44407261e-01  8.74190001e-01  6.47033123e-02
   1.43444678e+00  2.45381493e+00  2.34939449e+00  1.88010848e+00
  -2.90245005e-01 -3.84982900e-01 -3.83559141e-01 -3.75373683e-01
  -1.38388765e-01 -1.61064310e-01 -1.60011322e-01 -1.56807243e-01
   1.86042433e-01 -1.68531237e+00 -2.89359316e+00  3.99160545e-01
   4.52540637e-01  1.06736357e+00  2.62678511e+00 -3.85349567e-01
  -1.61164593e-01]
 [-3.68809395e-01 -3.12269643e-01 -2.24078573e-01 -8.11085408e-01
  -1.10205843e+00 -1.68134860e+00 -1.64070579e+00 -1.34086972e+00
   1.00715728e+00  1.04682778e-01  1.07140945e-01  3.26589985e-01
  -3.20910998e-01  1.59690705e-01  3.37841509e-02  7.07198265e-03
   3.45672328e-01  1.97495873e-01  2.26029393e-01  7.76745783e-02
   1.17978017e+00  8.74166325e-01  7.06715833e-01  7.16959879e-01
  -3.11510237e-01 -3.80523474e-01 -3.72628380e-01 -3.71057661e-01
  -2.90245005e-01 -3.84982900e-01 -3.83559141e-01 -3.75373683e-01
  -1.38388765e-01 -1.61064310e-01 -1.60011322e-01 -1.56807243e-01
   1.86042433e-01 -1.68531237e+00  3.45591086e-01  3.99160545e-01
   4.52540637e-01  1.06736357e+00 -3.80693494e-01 -3.85349567e-01
  -1.61164593e-01]
 [-4.38586135e-01  6.14049497e-02 -1.35454257e-01 -1.08420027e-01
   7.15821875e-01  4.54686545e-01  9.70640172e-02  1.47997987e+00
   2.21228306e-01  3.01663407e-01  6.62170165e-01  1.10556179e-01
  -1.24612607e+00 -2.48642269e+00 -2.39717100e+00 -1.89246256e+00
   1.15526102e+00  3.78806545e-01  3.18678683e-01  5.84590004e-01
  -6.68806664e-01  1.10754750e+00  7.69411291e-01  2.72963729e+00
  -3.11510237e-01 -3.80523474e-01 -3.72628380e-01 -3.71057661e-01
  -2.90245005e-01 -3.84982900e-01 -3.83559141e-01 -3.75373683e-01
  -1.38388765e-01 -1.61064310e-01 -1.60011322e-01 -1.56807243e-01
   1.86042433e-01  5.93361812e-01  3.45591086e-01 -2.50525763e+00
   4.52540637e-01  1.06736357e+00 -3.80693494e-01 -3.85349567e-01
  -1.61164593e-01]]

# Find optimal k using the elbow method
inertia = []
k_values = range(1, 11)
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data_scaled)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.grid()
plt.show()

# Perform K-Means with the chosen k
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans_labels = kmeans.fit_predict(data_scaled)

# Add cluster labels to the original dataset
data_preprocessed['Cluster'] = kmeans_labels
print("Cluster Labels Assigned:")
print(data_preprocessed[['Cluster']].head())

Cluster Labels Assigned:
   Cluster
0        1
1        3
2        2
3        3
4        1

# Compute the linkage matrix
linkage_matrix = linkage(data_scaled, method='ward')

# Set a custom color palette for cluster branches
set_link_color_palette(['orange', 'blue', 'green', 'red', 'purple'])

# Plot the dendrogram with custom colors and simplified labels
plt.figure(figsize=(12, 8))
dendrogram(
    linkage_matrix,
    color_threshold=40,  # Threshold for coloring clusters
    above_threshold_color='grey',  # Color for branches above the threshold
    truncate_mode='level',  # Show only the top levels of the tree
    p=5,  # Display the top 5 levels of the hierarchy
)
plt.axhline(y=40, color='red', linestyle='--', label='Cluster Cut Threshold')  # Add a horizontal threshold line
plt.title('Cluster Threshold with Simplified Labels')
plt.xlabel('Data Points (Truncated)')
plt.ylabel('Euclidean Distance')
plt.legend()
plt.grid()
plt.show()

# Calculate silhouette scores for K-Means and Hierarchical clustering
silhouette_kmeans = silhouette_score(data_scaled, kmeans_labels)
silhouette_hierarchical = silhouette_score(data_scaled, hierarchical_labels)

print(f"Silhouette Score for K-Means: {silhouette_kmeans}")
print(f"Silhouette Score for Hierarchical Clustering: {silhouette_hierarchical}")

Silhouette Score for K-Means: 0.23400809694736652
Silhouette Score for Hierarchical Clustering: 0.35425332275556287

# Visualize K-Means clusters
plt.figure(figsize=(8, 6))
sns.scatterplot(x=data_scaled[:, 0], y=data_scaled[:, 1], hue=kmeans_labels, palette='viridis')
plt.title('K-Means Clustering Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(title='Cluster')
plt.show()

# Visualize Hierarchical clusters
plt.figure(figsize=(8, 6))
sns.scatterplot(x=data_scaled[:, 0], y=data_scaled[:, 1], hue=hierarchical_labels, palette='viridis')
plt.title('Hierarchical Clustering Results')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(title='Cluster')
plt.show()

# Add hierarchical cluster labels to the dataset (if not already done)
data_preprocessed['Hierarchical_Cluster'] = hierarchical_labels

# Use these cluster labels for aggregation and visualization
cluster_column = 'Hierarchical_Cluster'

# Select numeric columns for analysis
numeric_columns = data_preprocessed.select_dtypes(include=['float64', 'int64']).columns

# Summarize each cluster by calculating the mean of numeric metrics
cluster_summary = data_preprocessed.groupby('Hierarchical_Cluster')[numeric_columns].mean()

# Display the summarized results
print("Cluster Summary:")
print(cluster_summary)

# Save for further inspection
cluster_summary.to_csv('hierarchical_cluster_summary.csv', index=True)

Cluster Summary:
                      n_ff_formatted  ff_avg_speed  ff_avg_spin  \
Hierarchical_Cluster                                              
1                           0.021327      0.206482     0.159335   
2                          -0.236109      0.183633     0.246863   
3                          -1.903898     -5.324254    -5.065765   
4                           0.082753      0.180623     0.175744   

                      ff_avg_break_z_induced  n_sl_formatted  sl_avg_speed  \
Hierarchical_Cluster                                                         
1                                   0.254782        0.023007      0.114032   
2                                   0.109715       -0.885356     -0.772709   
3                                  -4.081182       -0.191481     -0.300164   
4                                   0.123431        0.031881      0.017766   

                      sl_avg_spin  sl_avg_break  n_ch_formatted  ch_avg_speed  \
Hierarchical_Cluster                                                            
1                        0.104619      0.083446       -1.156690     -2.021971   
2                       -0.774306     -0.712496       -0.006881     -0.117597   
3                       -0.275192     -0.185006        0.296919     -0.107041   
4                        0.018342      0.016176        0.179616      0.343231   

                      ...  sv_avg_break  has_ff_formatted  has_sl_formatted  \
Hierarchical_Cluster  ...                                                     
1                     ...     -0.156807          0.186042          0.116072   
2                     ...      6.037079          0.186042         -0.790119   
3                     ...     -0.156807         -5.375118         -0.268839   
4                     ...     -0.156807          0.186042          0.016676   

                      has_ch_formatted  has_cu_formatted  has_si_formatted  \
Hierarchical_Cluster                                                         
1                            -2.018138         -0.071826         -0.213031   
2                            -0.117150         -0.638132          0.072214   
3                            -0.092137         -0.385817          0.452541   
4                             0.341964          0.047898          0.014292   

                      has_fc_formatted  has_fs_formatted  has_st_formatted  \
Hierarchical_Cluster                                                         
1                             0.295456          2.301652          0.218785   
2                            -0.292664          0.156356         -0.172464   
3                             0.254829         -0.218127          0.017407   
4                            -0.050349         -0.377326         -0.031574   

                      has_sv_formatted  
Hierarchical_Cluster                    
1                            -0.161165  
2                             6.204837  
3                            -0.161165  
4                            -0.161165  

[4 rows x 45 columns]

# Identify pitch usage columns
pitch_columns = [col for col in data_preprocessed.columns if 'has_' in col]

# Summarize pitch usage by cluster
pitch_usage_summary = data_preprocessed.groupby('Hierarchical_Cluster')[pitch_columns].sum()

# Display the dominant pitch types for each cluster
print("Dominant Pitch Types by Cluster:")
print(pitch_usage_summary)

# Save the summary for inspection
pitch_usage_summary.to_csv('pitch_usage_summary.csv', index=True)

Dominant Pitch Types by Cluster:
                      has_ff_formatted  has_sl_formatted  has_ch_formatted  \
Hierarchical_Cluster                                                         
1                            27.534280         17.178649       -298.684417   
2                             5.209188        -22.123330         -3.280187   
3                          -198.879360         -9.947051         -3.409051   
4                           166.135892         14.891733        305.373655   

                      has_cu_formatted  has_si_formatted  has_fc_formatted  \
Hierarchical_Cluster                                                         
1                           -10.630276        -31.528602         43.727475   
2                           -17.867686          2.021990         -8.194598   
3                           -14.275242         16.744004          9.428680   
4                            42.773204         12.762609        -44.961558   

                      has_fs_formatted  has_st_formatted  has_sv_formatted  
Hierarchical_Cluster                                                        
1                           340.644538         32.380143        -23.852360  
2                             4.377975         -4.828996        173.735431  
3                            -8.070702          0.644046         -5.963090  
4                          -336.951811        -28.195192       -143.919981

# Melt the dataset for easier faceting
key_metrics = [col for col in data_preprocessed.columns if 'avg_' in col]  # All pitch-related metrics
melted_data = pd.melt(
    data_preprocessed,
    id_vars=['Hierarchical_Cluster'],
    value_vars=key_metrics,
    var_name='Metric',
    value_name='Value'
)

# Use Seaborn's FacetGrid to create facet-wrapped boxplots
g = sns.FacetGrid(melted_data, col="Metric", col_wrap=4, height=4, sharey=False)
g.map(sns.boxplot, 'Hierarchical_Cluster', 'Value', order=sorted(data_preprocessed['Hierarchical_Cluster'].unique()))
g.set_titles("{col_name}")
g.set_axis_labels("Cluster", "Value")
g.tight_layout()
plt.show()

# Define pitch groups
pitch_groups = {
    'Fastball': ['ff_avg_speed', 'ff_avg_spin', 'ff_avg_break_z_induced'],
    'Slider': ['sl_avg_speed', 'sl_avg_spin', 'sl_avg_break'],
    'Curveball': ['cu_avg_speed', 'cu_avg_spin', 'cu_avg_break'],
    'Changeup': ['ch_avg_speed', 'ch_avg_spin', 'ch_avg_break'],
    'Sinker': ['si_avg_speed', 'si_avg_spin', 'si_avg_break'],
    'Cutter': ['fc_avg_speed', 'fc_avg_spin', 'fc_avg_break'],
    'Splitter': ['fs_avg_speed', 'fs_avg_spin', 'fs_avg_break'],
    'Knuckleball': ['st_avg_speed', 'st_avg_spin', 'st_avg_break'],
    'Sweeper': ['sv_avg_speed', 'sv_avg_spin', 'sv_avg_break']
}

# Calculate the number of pitch groups
n_pitch_groups = len(pitch_groups)
n_cols = 2  # Number of columns in the grid
n_rows = (n_pitch_groups + 1) // n_cols  # Rows needed for the grid

# Create a grid of subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), constrained_layout=True)
axes = axes.flatten()  # Flatten the axes array for easy iteration

# Plot each pitch group
for i, (pitch_type, metrics) in enumerate(pitch_groups.items()):
    melted = pd.melt(data_preprocessed, id_vars='Hierarchical_Cluster', value_vars=metrics)
    sns.boxplot(x='Hierarchical_Cluster', y='value', hue='variable', data=melted, ax=axes[i])
    axes[i].set_title(f'{pitch_type} Metrics Across Clusters')
    axes[i].set_xlabel('Cluster')
    axes[i].set_ylabel('Value')
    axes[i].legend(title='Metric')
    axes[i].grid()

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    axes[j].set_visible(False)

# Save the combined figure as an image
plt.savefig('pitch_metrics_across_clusters.png', dpi=300)
plt.show()

!cp "/content/drive/MyDrive/Colab Notebooks/pca_clustering_silverstein.ipynb" ./
!jupyter nbconvert --to html "pca_clustering_silversteinipynb"

[NbConvertApp] Converting notebook pca_clustering_silversteinipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 7 image(s).
[NbConvertApp] Writing 1286567 bytes to pca_clustering_silversteinipyn.html

Grouping Pitchers by Pitch Arsenal Using PCA and Clustering¶

Packages¶

Data Cleanup¶

Basic info¶

Identify Pitch Usage Columns and Create Binary Flag¶

Handle Missing Values by Imputation¶

Standardize Numeric Data¶

Review dataset¶

PCA¶

Feature contributions¶

Retain Multiple PCs for Clustering:¶

Clustering¶

Prepare for Clustering¶

K- Means Clustering¶

Hierarchical Clustering¶

Evaluate cluster results¶

Visualize¶

Interpretable information¶

Identify dominant pitch types¶