Previous articles in this series:

In this brief article, we preprocess and save data to be used in constructing machine learning algorithms to predict wins and losses for NBA teams. The raw data, from Basketball Reference, that we read into the functions below can be found here.

1. Standard Scaler
We apply the standard scaler to the data for each season separately, so that each feature for a given team is measured relative to all other teams in that same NBA season. The hope is that this captures subtle, though perhaps not drastic, shifts in styles of play over time. To guard against possibly drastic shifts, we exclude data from before the 2008-2009 season.

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

def generate_ml_data_standard_scaler(data_dir, seasons_array_calib, season_prod,
                                     random_state=None):
    """Build and save standardized calibration and production arrays.

    For each calibration season, the offense table
    (``dfteamsoff_<season>.pkl``) and the defense table
    (``dfteamsdef_<season>.pkl``) are read from ``data_dir`` and each is
    standardized with its own freshly fitted ``StandardScaler``, so scaling
    statistics are never shared between seasons or between tables. The
    scaled offense and defense columns are placed side by side, the
    ``'Win Fraction'`` column of ``dfwinloss_<season>.pkl`` is used as the
    target, and rows and targets are shuffled with one shared permutation.

    Files written into ``data_dir``:
        * ``x_calibration_standard_scaler.npy`` -- shuffled calibration features
        * ``y_calibration_standard_scaler.npy`` -- matching shuffled targets
        * ``x_production_standard_scaler.npy`` -- scaled features of
          ``season_prod`` (not shuffled; no target is read for it)

    NOTE(review): the offense, defense and win/loss tables of a season are
    assumed to list teams in the same row order; nothing here checks it.
    NOTE(review): inputs are read with ``pd.read_pickle``; only point
    ``data_dir`` at trusted files.

    Args:
        data_dir: Directory holding the input pickles and receiving the
            ``.npy`` outputs. A trailing path separator is optional.
        seasons_array_calib: Iterable of season labels used for calibration.
        season_prod: Season label of the production data set.
        random_state: Forwarded to ``sklearn.utils.shuffle``. Pass an int for
            a reproducible row order; the default ``None`` keeps the
            unseeded behavior.

    Returns:
        None. All results are written to disk.

    Raises:
        ValueError: If ``seasons_array_calib`` is empty.
    """
    import os  # function-scope import: os is not among the file's imports

    # Materialize once: allows an emptiness check that also works for numpy
    # arrays and generators, and lets the seasons be traversed several times.
    seasons_calib = list(seasons_array_calib)
    if not seasons_calib:
        raise ValueError('seasons_array_calib must contain at least one season')

    def load_table(prefix, season):
        # os.path.join works whether or not data_dir ends with a separator.
        return pd.read_pickle(os.path.join(data_dir, prefix + str(season) + '.pkl'))

    def scaled_features(prefix, season):
        # A new scaler per table and per season keeps every feature relative
        # to the other teams of that season only.
        return StandardScaler().fit_transform(load_table(prefix, season).values)

    x_off_calib = np.vstack(
        [scaled_features('dfteamsoff_', season) for season in seasons_calib])
    x_def_calib = np.vstack(
        [scaled_features('dfteamsdef_', season) for season in seasons_calib])
    y_calib = np.hstack(
        [load_table('dfwinloss_', season)['Win Fraction'].values
         for season in seasons_calib])

    # Offense columns first, defense columns second.
    x_calib = np.hstack([x_off_calib, x_def_calib])

    # One shared permutation for features and targets; shuffle() also rejects
    # inputs whose row counts differ instead of letting them drift apart.
    x_calib, y_calib = shuffle(x_calib, y_calib, random_state=random_state)

    np.save(os.path.join(data_dir, 'x_calibration_standard_scaler.npy'), x_calib)
    np.save(os.path.join(data_dir, 'y_calibration_standard_scaler.npy'), y_calib)

    # The production season is scaled on its own statistics, like every
    # calibration season, and is saved in its original row order.
    x_prod = np.hstack([scaled_features('dfteamsoff_', season_prod),
                        scaled_features('dfteamsdef_', season_prod)])
    np.save(os.path.join(data_dir, 'x_production_standard_scaler.npy'), x_prod)

2. MinMax Scaler
We do the same as above, but use the minmax scaler so that, within each season, every feature is rescaled to the closed interval [0.05, 0.95].

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

def generate_ml_data_minmax_scaler(data_dir, seasons_array_calib, season_prod,
                                   random_state=None, feature_range=(0.05, 0.95)):
    """Build and save min-max scaled calibration and production arrays.

    For each calibration season, the offense table
    (``dfteamsoff_<season>.pkl``) and the defense table
    (``dfteamsdef_<season>.pkl``) are read from ``data_dir`` and each is
    rescaled with its own freshly fitted ``MinMaxScaler``, so the minimum
    and maximum of a feature are taken within one season and one table only.
    The scaled offense and defense columns are placed side by side, the
    ``'Win Fraction'`` column of ``dfwinloss_<season>.pkl`` is used as the
    target, and rows and targets are shuffled with one shared permutation.

    Files written into ``data_dir``:
        * ``x_calibration_minmax_scaler.npy`` -- shuffled calibration features
        * ``y_calibration_minmax_scaler.npy`` -- matching shuffled targets
        * ``x_production_minmax_scaler.npy`` -- scaled features of
          ``season_prod`` (not shuffled; no target is read for it)

    NOTE(review): the offense, defense and win/loss tables of a season are
    assumed to list teams in the same row order; nothing here checks it.
    NOTE(review): inputs are read with ``pd.read_pickle``; only point
    ``data_dir`` at trusted files.

    Args:
        data_dir: Directory holding the input pickles and receiving the
            ``.npy`` outputs. A trailing path separator is optional.
        seasons_array_calib: Iterable of season labels used for calibration.
        season_prod: Season label of the production data set.
        random_state: Forwarded to ``sklearn.utils.shuffle``. Pass an int for
            a reproducible row order; the default ``None`` keeps the
            unseeded behavior.
        feature_range: ``(low, high)`` target range handed to every
            ``MinMaxScaler``. Defaults to ``(0.05, 0.95)``.

    Returns:
        None. All results are written to disk.

    Raises:
        ValueError: If ``seasons_array_calib`` is empty.
    """
    import os  # function-scope import: os is not among the file's imports

    # Materialize once: allows an emptiness check that also works for numpy
    # arrays and generators, and lets the seasons be traversed several times.
    seasons_calib = list(seasons_array_calib)
    if not seasons_calib:
        raise ValueError('seasons_array_calib must contain at least one season')

    def load_table(prefix, season):
        # os.path.join works whether or not data_dir ends with a separator.
        return pd.read_pickle(os.path.join(data_dir, prefix + str(season) + '.pkl'))

    def scaled_features(prefix, season):
        # A new scaler per table and per season keeps every feature relative
        # to the other teams of that season only.
        scaler = MinMaxScaler(feature_range=feature_range)
        return scaler.fit_transform(load_table(prefix, season).values)

    x_off_calib = np.vstack(
        [scaled_features('dfteamsoff_', season) for season in seasons_calib])
    x_def_calib = np.vstack(
        [scaled_features('dfteamsdef_', season) for season in seasons_calib])
    y_calib = np.hstack(
        [load_table('dfwinloss_', season)['Win Fraction'].values
         for season in seasons_calib])

    # Offense columns first, defense columns second.
    x_calib = np.hstack([x_off_calib, x_def_calib])

    # One shared permutation for features and targets; shuffle() also rejects
    # inputs whose row counts differ instead of letting them drift apart.
    x_calib, y_calib = shuffle(x_calib, y_calib, random_state=random_state)

    np.save(os.path.join(data_dir, 'x_calibration_minmax_scaler.npy'), x_calib)
    np.save(os.path.join(data_dir, 'y_calibration_minmax_scaler.npy'), y_calib)

    # The production season is scaled on its own min/max, like every
    # calibration season, and is saved in its original row order.
    x_prod = np.hstack([scaled_features('dfteamsoff_', season_prod),
                        scaled_features('dfteamsdef_', season_prod)])
    np.save(os.path.join(data_dir, 'x_production_minmax_scaler.npy'), x_prod)

Pin It on Pinterest