Previous articles in this series:

Hyperparameter Search With Optuna: Part 1 – Scikit-learn Classification and Ensembling
Hyperparameter Search With Optuna: Part 2 – XGBoost Classification and Ensembling

  1. Introduction
  2. Using Optuna With Keras
  3. Results
  4. Code

1. Introduction

In this article, we use the tree-structured Parzen estimator (TPE) algorithm via Optuna to find hyperparameters for a convolutional neural network (CNN) built with Keras, applied to the MNIST handwritten digits classification problem.

2. Using Optuna With Keras

To integrate Keras with Optuna, we use the following class.

class Objective(object):
    def __init__(self, xcalib, ycalib, dir_save,
                 max_epochs, early_stop, learn_rate_epochs,
                 input_shape, number_of_classes):
        self.xcalib = xcalib
        self.ycalib = ycalib
        self.max_epochs = max_epochs
        self.early_stop = early_stop
        self.dir_save = dir_save
        self.learn_rate_epochs = learn_rate_epochs
        self.input_shape = input_shape
        self.number_of_classes = number_of_classes

    def __call__(self, trial):        
        num_cnn_blocks = trial.suggest_int('num_cnn_blocks', 2, 4)
        num_filters = trial.suggest_categorical('num_filters', [16, 32, 48, 64])
        kernel_size = trial.suggest_int('kernel_size', 2, 4)
        num_dense_nodes = trial.suggest_categorical('num_dense_nodes',
                                                    [64, 128, 512, 1024])
        dense_nodes_divisor = trial.suggest_categorical('dense_nodes_divisor',
                                                        [2, 4, 8])
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 96, 128])
        drop_out = trial.suggest_discrete_uniform('drop_out', 0.05, 0.5, 0.05)
                         
        dict_params = {'num_cnn_blocks':num_cnn_blocks,
                       'num_filters':num_filters,
                       'kernel_size':kernel_size,
                       'num_dense_nodes':num_dense_nodes,
                       'dense_nodes_divisor':dense_nodes_divisor,
                       'batch_size':batch_size,
                       'drop_out':drop_out}
                                             
        # start of cnn coding   
        input_tensor = Input(shape=self.input_shape)
        
        # 1st cnn block
        x = BatchNormalization()(input_tensor)
        x = Activation('relu')(x)
        x = Conv2D(filters=dict_params['num_filters'],
                   kernel_size=dict_params['kernel_size'],
                   strides=1, padding='same')(x)
        #x = MaxPooling2D()(x)
        x = Dropout(dict_params['drop_out'])(x)
        
        # additional cnn blocks
        for iblock in range(dict_params['num_cnn_blocks'] - 1):
            x = BatchNormalization()(x)
            x = Activation('relu')(x)
            x = Conv2D(filters=dict_params['num_filters'],
                       kernel_size=dict_params['kernel_size'],
                       strides=1, padding='same')(x)
            x = MaxPooling2D()(x)
            x = Dropout(dict_params['drop_out'])(x)
                        
        # mlp
        x = Flatten()(x)
        x = Dense(dict_params['num_dense_nodes'], activation='relu')(x)
        x = Dropout(dict_params['drop_out'])(x)
        x = Dense(dict_params['num_dense_nodes']//dict_params['dense_nodes_divisor'], 
                  activation='relu')(x)
        output_tensor = Dense(self.number_of_classes, activation='softmax')(x)
        
        # instantiate and compile model
        cnn_model = Model(inputs=input_tensor, outputs=output_tensor)
        opt = Adam(lr=0.00025)  # default = 0.001
        cnn_model.compile(loss='categorical_crossentropy',
                          optimizer=opt, metrics=['accuracy'])
        
        
        # callbacks for early stopping and for learning rate reducer
        fn = self.dir_save + str(trial.number) + '_cnn.h5'
        callbacks_list = [EarlyStopping(monitor='val_loss', patience=self.early_stop),                     
                          ReduceLROnPlateau(monitor='val_loss', factor=0.1, 
                                            patience=self.learn_rate_epochs, 
                                            verbose=0, mode='auto', min_lr=1.0e-6),
                          ModelCheckpoint(filepath=fn,
                                          monitor='val_loss', save_best_only=True)]
            
        # fit the model
        h = cnn_model.fit(x=self.xcalib, y=self.ycalib,
                          batch_size=dict_params['batch_size'],
                          epochs=self.max_epochs,
                          validation_split=0.25,
                          shuffle=True, verbose=0,
                          callbacks=callbacks_list)
                
        validation_loss = np.min(h.history['val_loss'])
                
        return validation_loss

We are compelled to use a class here rather than a plain function because Optuna calls the objective with a single trial argument; wrapping the data and fixed settings in a callable class is a convenient way to pass these extra arguments (this pattern is discussed in the Optuna documentation).
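
Alternatively, the extra arguments can be bound with a lambda so that Optuna still receives a callable of a single trial argument. Below is a minimal sketch; objective_fn and its placeholder body are hypothetical, not part of the code above.

def objective_fn(trial, xcalib, ycalib):
    # suggest hyperparameters, build and fit the Keras model, and return the
    # minimum validation loss, exactly as in Objective.__call__ above
    drop_out = trial.suggest_discrete_uniform('drop_out', 0.05, 0.5, 0.05)
    validation_loss = drop_out  # placeholder so this sketch runs; replace with real training code
    return validation_loss

# Optuna still sees a callable that takes only the trial:
# study.optimize(lambda trial: objective_fn(trial, x_calib, y_calib), timeout=maximum_time)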

See remarks in our article Hyperparameter Search With Optuna: Part 1 – Scikit-learn Classification and Ensembling for how different types of parameter ranges are expressed in Optuna.
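
As a quick reminder, here is an illustrative sketch (not part of the original code) of the three kinds of parameter ranges used above: an integer range, a categorical set of choices, and a discrete (stepped) float range.

import optuna

def show_suggest_api(trial):
    kernel_size = trial.suggest_int('kernel_size', 2, 4)                     # integer range, inclusive
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 96, 128])  # fixed set of choices
    drop_out = trial.suggest_discrete_uniform('drop_out', 0.05, 0.5, 0.05)   # float on a 0.05 grid
    return drop_out  # dummy objective value so the sketch runs

# optuna.create_study().optimize(show_suggest_api, n_trials=2)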

We save the Keras model from every trial using the ModelCheckpoint callback. The number of models actually used in production will be smaller than the number of trials, because we later apply a validation loss threshold and a cap on the number of accepted models.

In the main code, we have:

maximum_epochs = 1000
early_stop_epochs = 10
learning_rate_epochs = 5
optimizer_direction = 'minimize'
number_of_random_points = 25  # random searches to start opt process
maximum_time = 4*60*60  # seconds

objective = Objective(x_calib, y_calib, results_directory,
                      maximum_epochs, early_stop_epochs,
                      learning_rate_epochs, shape_of_input, num_classes)

optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction=optimizer_direction,
        sampler=TPESampler(n_startup_trials=number_of_random_points))

study.optimize(objective, timeout=maximum_time)

# save results
df_results = study.trials_dataframe()
df_results.to_pickle(results_directory + 'df_optuna_results.pkl')
df_results.to_csv(results_directory + 'df_optuna_results.csv')

One of the most useful aspects of Optuna is the ability to save information about each trial in a DataFrame. See our previous article for information about other Optuna parameters in this code fragment.
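
As an illustration of how this DataFrame can be used later (the file path below is an assumption; adjust it to wherever the study results were saved), the completed trials can be reloaded, sorted by validation loss, and inspected:

import pandas as pd

# reload the saved study results (path is an assumption)
df_results = pd.read_pickle('optuna_cnn/calibration/df_optuna_results.pkl')

# keep completed trials, sort by validation loss, and view the 25 best
df_best = (df_results[df_results['state'] == 'COMPLETE']
           .sort_values(by='value', ascending=True)
           .head(25))
print(df_best[['number', 'value', 'params_num_filters', 'params_drop_out']])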

3. Results

Below is the DataFrame from the Optuna study. We sorted by the ‘value’ column (the validation loss) and kept only the 25 best results.

Optuna Results DataFrame

number | value | params_batch_size | params_dense_nodes_divisor | params_drop_out | params_kernel_size | params_num_cnn_blocks | params_num_dense_nodes | params_num_filters | system_attrs__number | state
32 | 0.0280 | 64 | 8 | 0.4 | 3 | 4 | 1024 | 48 | 32 | COMPLETE
25 | 0.0290 | 64 | 8 | 0.05 | 4 | 4 | 1024 | 48 | 25 | COMPLETE
27 | 0.0294 | 64 | 8 | 0.4 | 3 | 4 | 1024 | 48 | 27 | COMPLETE
10 | 0.0296 | 64 | 8 | 0.3 | 3 | 3 | 512 | 48 | 10 | COMPLETE
41 | 0.0297 | 32 | 8 | 0.4 | 3 | 4 | 1024 | 48 | 41 | COMPLETE
12 | 0.0304 | 32 | 4 | 0.3 | 3 | 4 | 1024 | 48 | 12 | COMPLETE
0 | 0.0306 | 32 | 4 | 0.1 | 4 | 4 | 128 | 16 | 0 | COMPLETE
30 | 0.0307 | 64 | 8 | 0.5 | 4 | 4 | 1024 | 48 | 30 | COMPLETE
14 | 0.0308 | 128 | 4 | 0.1 | 3 | 4 | 128 | 64 | 14 | COMPLETE
1 | 0.0308 | 32 | 2 | 0.2 | 3 | 3 | 512 | 16 | 1 | COMPLETE
28 | 0.0325 | 64 | 8 | 0.5 | 4 | 4 | 1024 | 48 | 28 | COMPLETE
29 | 0.0326 | 64 | 8 | 0.4 | 3 | 4 | 1024 | 48 | 29 | COMPLETE
8 | 0.0328 | 128 | 2 | 0.2 | 2 | 3 | 512 | 32 | 8 | COMPLETE
26 | 0.0330 | 64 | 8 | 0.05 | 4 | 4 | 1024 | 48 | 26 | COMPLETE
9 | 0.0336 | 128 | 4 | 0.2 | 2 | 3 | 128 | 32 | 9 | COMPLETE
45 | 0.0343 | 96 | 8 | 0.5 | 3 | 4 | 1024 | 48 | 45 | COMPLETE
48 | 0.0348 | 64 | 8 | 0.5 | 4 | 3 | 1024 | 48 | 48 | COMPLETE
4 | 0.0348 | 128 | 2 | 0.4 | 4 | 3 | 128 | 16 | 4 | COMPLETE
17 | 0.0353 | 128 | 2 | 0.25 | 4 | 3 | 128 | 16 | 17 | COMPLETE
21 | 0.0353 | 96 | 4 | 0.1 | 3 | 3 | 128 | 32 | 21 | COMPLETE
19 | 0.0353 | 64 | 4 | 0.15 | 3 | 3 | 512 | 64 | 19 | COMPLETE
40 | 0.0360 | 64 | 8 | 0.5 | 3 | 4 | 1024 | 48 | 40 | COMPLETE
18 | 0.0361 | 32 | 4 | 0.05 | 2 | 4 | 512 | 48 | 18 | COMPLETE
5 | 0.0363 | 32 | 2 | 0.35 | 2 | 3 | 512 | 32 | 5 | COMPLETE
33 | 0.0364 | 64 | 8 | 0.5 | 3 | 4 | 1024 | 48 | 33 | COMPLETE

To create the final result, we accepted only models with a validation loss below a threshold of 0.04, up to a maximum of 25 models. We then averaged the class probabilities predicted by the accepted models and took the class with the highest mean probability as the final prediction.
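
As a toy numerical illustration of this averaging and argmax step (the probabilities below are made up, not from the actual models):

import numpy as np

# predicted probabilities from two hypothetical models, for two samples and three classes
probs_model_a = np.array([[0.7, 0.2, 0.1],
                          [0.1, 0.5, 0.4]])
probs_model_b = np.array([[0.6, 0.3, 0.1],
                          [0.2, 0.3, 0.5]])

mean_probabilities = np.mean([probs_model_a, probs_model_b], axis=0)
# [[0.65 0.25 0.10]
#  [0.15 0.40 0.45]]

y_predicted_class = np.argmax(mean_probabilities, axis=1)  # -> array([0, 2])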

def make_final_predictions(xprod, yprod, 
                           list_class_names, models_directory, 
                           save_directory, df_params,
                           threshold, ml_name, num_models_accept, 
                           optimization_direction):
    
    if optimization_direction == 'maximize':
        df_params.sort_values(by='value', ascending=False, inplace=True)
    else:
        df_params.sort_values(by='value', ascending=True, inplace=True)
    
    # apply threshold
    accepted_models_num = 0
    list_predicted_prob = []
    num_models_total = df_params.shape[0]
    for i in range(num_models_total):
        if optimization_direction == 'maximize':
            bool1 = df_params.loc[df_params.index[i],'value'] > threshold
        else:
            bool1 = df_params.loc[df_params.index[i],'value'] < threshold
                    
        bool2 = df_params.loc[df_params.index[i],'state'] == 'COMPLETE'
        bool3 = accepted_models_num < num_models_accept
        if bool1 and bool2 and bool3:
            model_number = str(df_params.loc[df_params.index[i],'number'])
            
            try:
                cnn_model = load_model(models_directory + model_number + '_cnn.h5')
            except:
                print('\ncould not read model number:',model_number)
            else:
                list_predicted_prob.append(cnn_model.predict(xprod))
                accepted_models_num = accepted_models_num + 1
    
    # compute mean probabilities
    mean_probabilities = np.mean(list_predicted_prob, axis=0)
    
    # compute predicted class
    # argmax uses 1st occurrence in case of a tie
    y_predicted_class = np.argmax(mean_probabilities, axis=1)

    # report accuracy metrics on the production (test) data
    print('\nbalanced accuracy score =', balanced_accuracy_score(yprod, y_predicted_class))
    print('accuracy score =', accuracy_score(yprod, y_predicted_class))

Running this on the production (test) data gives:

balanced accuracy score = 0.9945
accuracy score = 0.9946


4. Code

Below is the code that has not already been shown in the sections above.

import os
import numpy as np
import pandas as pd
from keras.layers import Dense, Conv2D, BatchNormalization
from keras.layers import MaxPooling2D
from keras.layers import Input, Flatten, Dropout
from keras.layers import Activation
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.models import Model, load_model
from keras.utils import to_categorical
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
import time
from pathlib import Path

# *******************************************************************

# raw shapes: x:(num pts, 784), y:(num pts,)

def get_data_mnist(data_type, data_dir):
    
    data_dictionary = {}
    
    if data_type == 'calibration':
        # x data - reshape = (60000, 28, 28, 1) - channels last
        x_calib_raw = np.load(data_dir + 'x_mnist_calibration.npy', allow_pickle=True)
        x_calib_raw = x_calib_raw.astype('float32')/255.0  # normalize
        x_calib = np.reshape(x_calib_raw, (x_calib_raw.shape[0], 28, 28, 1))
        
        # y data - shape = (60000, ) -> (60000, 10)
        y_calib = to_categorical(np.load(data_dir + 'y_mnist_calibration.npy', 
                                         allow_pickle=True))
        
        # split
        x_calib1, x_calib2, y_calib1, y_calib2 = train_test_split(x_calib, y_calib,
        train_size=0.5, shuffle=True, stratify=y_calib)
        
        data_dictionary['x_calib1'] = x_calib1
        data_dictionary['y_calib1'] = y_calib1
        
        data_dictionary['x_calib2'] = x_calib2
        data_dictionary['y_calib2'] = y_calib2
        
    elif data_type == 'production':
        # x data - reshape = (10000, 28, 28, 1) - channels last
        x_prod_raw = np.load(data_dir + 'x_mnist_production.npy', allow_pickle=True)
        x_prod_raw = x_prod_raw.astype('float32')/255.0  # normalize
        x_prod = np.reshape(x_prod_raw, (x_prod_raw.shape[0], 28, 28, 1))
        
        # y data - shape = (10000, )
        y_prod = np.load(data_dir + 'y_mnist_production.npy', allow_pickle=True)
        
        data_dictionary['x_prod'] = x_prod
        data_dictionary['y_prod'] = y_prod
        
    else:
        raise NameError
   
    return data_dictionary
    
# end of get_data_mnist()
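
# helper sketch (an assumption, not part of the original code): one way the raw
# .npy files read by get_data_mnist() could have been created from the standard
# Keras MNIST data (calibration = training split, production = test split)
def make_mnist_npy_files(data_dir):
    from keras.datasets import mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    np.save(data_dir + 'x_mnist_calibration.npy', x_train.reshape(-1, 784))
    np.save(data_dir + 'y_mnist_calibration.npy', y_train)
    np.save(data_dir + 'x_mnist_production.npy', x_test.reshape(-1, 784))
    np.save(data_dir + 'y_mnist_production.npy', y_test)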

# *******************************************************************

if __name__ == '__main__':    
    
    ml_algorithm_name = 'cnn'
    file_name_stub = 'optuna_' + ml_algorithm_name 
    
    calculation_type = 'production'  # choose 'calibration' or 'production'
    
    base_directory = 'YOUR DIRECTORY'  # replace with your own base directory; must end with '/'
    data_directory = base_directory + 'data/'
    
    
    results_directory_stub = base_directory + file_name_stub + '/'
    if not Path(results_directory_stub).is_dir():
        os.mkdir(results_directory_stub)
              
    # fixed parameters - calibration
    maximum_epochs = 1000
    early_stop_epochs = 10
    learning_rate_epochs = 5
    optimizer_direction = 'minimize'
    number_of_random_points = 25  # random searches to start opt process
    maximum_time = 4*60*60  # seconds
              
    # fixed parameters - production
    threshold_error = 0.04  # validation loss
    number_of_models = 25
    
    # read data
    dict_data = get_data_mnist(calculation_type, data_directory)
            
    print('\n*** starting at',pd.Timestamp.now())
    start_time_total = time.time()

    # calibration 
    if calculation_type == 'calibration':
        
        # using 1/2 of calib data
        x_calib = dict_data['x_calib1']
        y_calib = dict_data['y_calib1']
        
        shape_of_input = x_calib.shape[1:] # (28, 28, 1) - channels last
        num_classes = y_calib.shape[1]
                
        results_directory = results_directory_stub + calculation_type + '/'
        if not Path(results_directory).is_dir():
            os.mkdir(results_directory)
                
        objective = Objective(x_calib, y_calib, results_directory,
                              maximum_epochs, early_stop_epochs,
                              learning_rate_epochs, shape_of_input, num_classes)
    
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        study = optuna.create_study(direction=optimizer_direction,
                sampler=TPESampler(n_startup_trials=number_of_random_points))
        
        study.optimize(objective, timeout=maximum_time)
        
        # save results
        df_results = study.trials_dataframe()
        df_results.to_pickle(results_directory + 'df_optuna_results.pkl')
        df_results.to_csv(results_directory + 'df_optuna_results.csv')
        
        
        elapsed_time_total = (time.time()-start_time_total)/60
        print('\n\ntotal elapsed time =',elapsed_time_total,' minutes')
                          
    elif calculation_type == 'production':
        
        # get optuna results parameters
        models_dir = results_directory_stub + 'calibration/'
        df_parameters = pd.read_pickle(models_dir + 'df_optuna_results.pkl')
        
        results_directory = results_directory_stub + calculation_type + '/'
        if not Path(results_directory).is_dir():
            os.mkdir(results_directory)
            
            
        x_prod = dict_data['x_prod']
        y_prod = dict_data['y_prod']
        
        num_classes = np.unique(y_prod).shape[0]
        class_names_list = []
        for i in range(num_classes):
            class_names_list.append('class ' + str(i))
                
        make_final_predictions(x_prod, y_prod, class_names_list, 
                               models_dir, results_directory, 
                               df_parameters,  
                               threshold_error, file_name_stub,
                               number_of_models, optimizer_direction)
             
    else:
        print('\ninvalid calculation type:',calculation_type)
        raise NameError