Previous articles in this series:

Hyperparameter Search With Optuna: Part 1 – Scikit-learn Classification and Ensembling
Hyperparameter Search With Optuna: Part 2 – XGBoost Classification and Ensembling

  1. Introduction
  2. Using Optuna With Keras
  3. Results
  4. Code

1. Introduction

In this article, we use the tree-structured Parzen estimator (TPE) algorithm via Optuna to find hyperparameters for a convolutional neural network (CNN) built with Keras, applied to the MNIST handwritten digits classification problem.

2. Using Optuna With Keras

To integrate Keras with Optuna, we use the following class.

class Objective(object):
    def __init__(self, xcalib, ycalib, dir_save,
                 max_epochs, early_stop, learn_rate_epochs,
                 input_shape, number_of_classes):
        self.xcalib = xcalib
        self.ycalib = ycalib
        self.max_epochs = max_epochs
        self.early_stop = early_stop
        self.dir_save = dir_save
        self.learn_rate_epochs = learn_rate_epochs
        self.input_shape = input_shape
        self.number_of_classes = number_of_classes

    def __call__(self, trial):        
        num_cnn_blocks = trial.suggest_int('num_cnn_blocks', 2, 4)
        num_filters = trial.suggest_categorical('num_filters', [16, 32, 48, 64])
        kernel_size = trial.suggest_int('kernel_size', 2, 4)
        num_dense_nodes = trial.suggest_categorical('num_dense_nodes',
                                                    [64, 128, 512, 1024])
        dense_nodes_divisor = trial.suggest_categorical('dense_nodes_divisor',
                                                        [2, 4, 8])
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 96, 128])
        drop_out = trial.suggest_discrete_uniform('drop_out', 0.05, 0.5, 0.05)
                         
        dict_params = {'num_cnn_blocks':num_cnn_blocks,
                       'num_filters':num_filters,
                       'kernel_size':kernel_size,
                       'num_dense_nodes':num_dense_nodes,
                       'dense_nodes_divisor':dense_nodes_divisor,
                       'batch_size':batch_size,
                       'drop_out':drop_out}
                                             
        # start of cnn coding   
        input_tensor = Input(shape=self.input_shape)
        
        # 1st cnn block
        x = BatchNormalization()(input_tensor)
        x = Activation('relu')(x)
        x = Conv2D(filters=dict_params['num_filters'],
                   kernel_size=dict_params['kernel_size'],
                   strides=1, padding='same')(x)
        #x = MaxPooling2D()(x)
        x = Dropout(dict_params['drop_out'])(x)
        
        # additional cnn blocks
        for iblock in range(dict_params['num_cnn_blocks'] - 1):
            x = BatchNormalization()(x)
            x = Activation('relu')(x)
            x = Conv2D(filters=dict_params['num_filters'],
                       kernel_size=dict_params['kernel_size'],
                       strides=1, padding='same')(x)
            x = MaxPooling2D()(x)
            x = Dropout(dict_params['drop_out'])(x)
                        
        # mlp
        x = Flatten()(x)
        x = Dense(dict_params['num_dense_nodes'], activation='relu')(x)
        x = Dropout(dict_params['drop_out'])(x)
        x = Dense(dict_params['num_dense_nodes']//dict_params['dense_nodes_divisor'], 
                  activation='relu')(x)
        output_tensor = Dense(self.number_of_classes, activation='softmax')(x)
        
        # instantiate and compile model
        cnn_model = Model(inputs=input_tensor, outputs=output_tensor)
        opt = Adam(lr=0.00025)  # default = 0.001
        cnn_model.compile(loss='categorical_crossentropy',
                          optimizer=opt, metrics=['accuracy'])
        
        
        # callbacks for early stopping and for learning rate reducer
        fn = self.dir_save + str(trial.number) + '_cnn.h5'
        callbacks_list = [EarlyStopping(monitor='val_loss', patience=self.early_stop),                     
                          ReduceLROnPlateau(monitor='val_loss', factor=0.1, 
                                            patience=self.learn_rate_epochs, 
                                            verbose=0, mode='auto', min_lr=1.0e-6),
                          ModelCheckpoint(filepath=fn,
                                          monitor='val_loss', save_best_only=True)]
            
        # fit the model
        h = cnn_model.fit(x=self.xcalib, y=self.ycalib,
                          batch_size=dict_params['batch_size'],
                          epochs=self.max_epochs,
                          validation_split=0.25,
                          shuffle=True, verbose=0,
                          callbacks=callbacks_list)
                
        validation_loss = np.min(h.history['val_loss'])
                
        return validation_loss

We are compelled to use a class here rather than a plain function because Optuna calls the objective with a single trial argument; wrapping the data and fixed settings in a callable class is a convenient way to pass these extra arguments (this pattern is discussed in the Optuna documentation).
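
Alternatively, the extra arguments can be bound with a lambda so that Optuna still receives a callable of a single trial argument. Below is a minimal sketch; objective_fn and its placeholder body are hypothetical, not part of the code above.

def objective_fn(trial, xcalib, ycalib):
    # suggest hyperparameters, build and fit the Keras model, and return the
    # minimum validation loss, exactly as in Objective.__call__ above
    drop_out = trial.suggest_discrete_uniform('drop_out', 0.05, 0.5, 0.05)
    validation_loss = drop_out  # placeholder so this sketch runs; replace with real training code
    return validation_loss

# Optuna still sees a callable that takes only the trial:
# study.optimize(lambda trial: objective_fn(trial, x_calib, y_calib), timeout=maximum_time)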

See remarks in our article Hyperparameter Search With Optuna: Part 1 – Scikit-learn Classification and Ensembling for how different types of parameter ranges are expressed in Optuna.
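
As a quick reminder, here is an illustrative sketch (not part of the original code) of the three kinds of parameter ranges used above: an integer range, a categorical set of choices, and a discrete (stepped) float range.

import optuna

def show_suggest_api(trial):
    kernel_size = trial.suggest_int('kernel_size', 2, 4)                     # integer range, inclusive
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 96, 128])  # fixed set of choices
    drop_out = trial.suggest_discrete_uniform('drop_out', 0.05, 0.5, 0.05)   # float on a 0.05 grid
    return drop_out  # dummy objective value so the sketch runs

# optuna.create_study().optimize(show_suggest_api, n_trials=2)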

We save the Keras model from every trial using the ModelCheckpoint callback. The number of models actually used in production will be smaller than the number of trials, because we later apply a validation loss threshold and a cap on the number of accepted models.

In the main code, we have:

maximum_epochs = 1000
early_stop_epochs = 10
learning_rate_epochs = 5
optimizer_direction = 'minimize'
number_of_random_points = 25  # random searches to start opt process
maximum_time = 4*60*60  # seconds

objective = Objective(x_calib, y_calib, results_directory,
                      maximum_epochs, early_stop_epochs,
                      learning_rate_epochs, shape_of_input, num_classes)

optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction=optimizer_direction,
        sampler=TPESampler(n_startup_trials=number_of_random_points))

study.optimize(objective, timeout=maximum_time)

# save results
df_results = study.trials_dataframe()
df_results.to_pickle(results_directory + 'df_optuna_results.pkl')
df_results.to_csv(results_directory + 'df_optuna_results.csv')

One of the most useful aspects of Optuna is the ability to save information about each trial in a DataFrame. See our previous article for information about other Optuna parameters in this code fragment.
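
As an illustration of how this DataFrame can be used later (the file path below is an assumption; adjust it to wherever the study results were saved), the completed trials can be reloaded, sorted by validation loss, and inspected:

import pandas as pd

# reload the saved study results (path is an assumption)
df_results = pd.read_pickle('optuna_cnn/calibration/df_optuna_results.pkl')

# keep completed trials, sort by validation loss, and view the 25 best
df_best = (df_results[df_results['state'] == 'COMPLETE']
           .sort_values(by='value', ascending=True)
           .head(25))
print(df_best[['number', 'value', 'params_num_filters', 'params_drop_out']])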

3. Results

Below is the DataFrame from the Optuna study. We sorted by the ‘value’ column (the validation loss) and kept only the 25 best results.

Optuna Results DataFrame

number | value | params_batch_size | params_dense_nodes_divisor | params_drop_out | params_kernel_size | params_num_cnn_blocks | params_num_dense_nodes | params_num_filters | system_attrs__number | state
32 | 0.0280 | 64 | 8 | 0.4 | 3 | 4 | 1024 | 48 | 32 | COMPLETE
25 | 0.0290 | 64 | 8 | 0.05 | 4 | 4 | 1024 | 48 | 25 | COMPLETE
27 | 0.0294 | 64 | 8 | 0.4 | 3 | 4 | 1024 | 48 | 27 | COMPLETE
10 | 0.0296 | 64 | 8 | 0.3 | 3 | 3 | 512 | 48 | 10 | COMPLETE
41 | 0.0297 | 32 | 8 | 0.4 | 3 | 4 | 1024 | 48 | 41 | COMPLETE
12 | 0.0304 | 32 | 4 | 0.3 | 3 | 4 | 1024 | 48 | 12 | COMPLETE
0 | 0.0306 | 32 | 4 | 0.1 | 4 | 4 | 128 | 16 | 0 | COMPLETE
30 | 0.0307 | 64 | 8 | 0.5 | 4 | 4 | 1024 | 48 | 30 | COMPLETE
14 | 0.0308 | 128 | 4 | 0.1 | 3 | 4 | 128 | 64 | 14 | COMPLETE
1 | 0.0308 | 32 | 2 | 0.2 | 3 | 3 | 512 | 16 | 1 | COMPLETE
28 | 0.0325 | 64 | 8 | 0.5 | 4 | 4 | 1024 | 48 | 28 | COMPLETE
29 | 0.0326 | 64 | 8 | 0.4 | 3 | 4 | 1024 | 48 | 29 | COMPLETE
8 | 0.0328 | 128 | 2 | 0.2 | 2 | 3 | 512 | 32 | 8 | COMPLETE
26 | 0.0330 | 64 | 8 | 0.05 | 4 | 4 | 1024 | 48 | 26 | COMPLETE
9 | 0.0336 | 128 | 4 | 0.2 | 2 | 3 | 128 | 32 | 9 | COMPLETE
45 | 0.0343 | 96 | 8 | 0.5 | 3 | 4 | 1024 | 48 | 45 | COMPLETE
48 | 0.0348 | 64 | 8 | 0.5 | 4 | 3 | 1024 | 48 | 48 | COMPLETE
4 | 0.0348 | 128 | 2 | 0.4 | 4 | 3 | 128 | 16 | 4 | COMPLETE
17 | 0.0353 | 128 | 2 | 0.25 | 4 | 3 | 128 | 16 | 17 | COMPLETE
21 | 0.0353 | 96 | 4 | 0.1 | 3 | 3 | 128 | 32 | 21 | COMPLETE
19 | 0.0353 | 64 | 4 | 0.15 | 3 | 3 | 512 | 64 | 19 | COMPLETE
40 | 0.0360 | 64 | 8 | 0.5 | 3 | 4 | 1024 | 48 | 40 | COMPLETE
18 | 0.0361 | 32 | 4 | 0.05 | 2 | 4 | 512 | 48 | 18 | COMPLETE
5 | 0.0363 | 32 | 2 | 0.35 | 2 | 3 | 512 | 32 | 5 | COMPLETE
33 | 0.0364 | 64 | 8 | 0.5 | 3 | 4 | 1024 | 48 | 33 | COMPLETE

To create the final result, we accepted only models with a validation loss below a threshold of 0.04, up to a maximum of 25 models. We then averaged the class probabilities predicted by the accepted models and took the class with the highest mean probability as the final prediction.
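
As a toy numerical illustration of this averaging and argmax step (the probabilities below are made up, not from the actual models):

import numpy as np

# predicted probabilities from two hypothetical models, for two samples and three classes
probs_model_a = np.array([[0.7, 0.2, 0.1],
                          [0.1, 0.5, 0.4]])
probs_model_b = np.array([[0.6, 0.3, 0.1],
                          [0.2, 0.3, 0.5]])

mean_probabilities = np.mean([probs_model_a, probs_model_b], axis=0)
# [[0.65 0.25 0.10]
#  [0.15 0.40 0.45]]

y_predicted_class = np.argmax(mean_probabilities, axis=1)  # -> array([0, 2])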

def make_final_predictions(xprod, yprod, 
                           list_class_names, models_directory, 
                           save_directory, df_params,
                           threshold, ml_name, num_models_accept, 
                           optimization_direction):
    
    if optimization_direction == 'maximize':
        df_params.sort_values(by='value', ascending=False, inplace=True)
    else:
        df_params.sort_values(by='value', ascending=True, inplace=True)
    
    # apply threshold
    accepted_models_num = 0
    list_predicted_prob = []
    num_models_total = df_params.shape[0]
    for i in range(num_models_total):
        if optimization_direction == 'maximize':
            bool1 = df_params.loc[df_params.index[i],'value'] > threshold
        else:
            bool1 = df_params.loc[df_params.index[i],'value'] < threshold
                    
        bool2 = df_params.loc[df_params.index[i],'state'] == 'COMPLETE'
        bool3 = accepted_models_num < num_models_accept
        if bool1 and bool2 and bool3:
            model_number = str(df_params.loc[df_params.index[i],'number'])
            
            try:
                cnn_model = load_model(models_directory + model_number + '_cnn.h5')
            except:
                print('\ncould not read model number:',model_number)
            else:
                list_predicted_prob.append(cnn_model.predict(xprod))
                accepted_models_num = accepted_models_num + 1
    
    # compute mean probabilities
    mean_probabilities = np.mean(list_predicted_prob, axis=0)
    
    # compute predicted class
    # argmax uses 1st occurrence in case of a tie
    y_predicted_class = np.argmax(mean_probabilities, axis=1)

    # report accuracy metrics on the production (test) data
    print('\nbalanced accuracy score =', balanced_accuracy_score(yprod, y_predicted_class))
    print('accuracy score =', accuracy_score(yprod, y_predicted_class))

Running this on the production (test) data gives:

balanced accuracy score = 0.9945
accuracy score = 0.9946


4. Code

Below is the code that has not already been shown in the sections above.

import os
import numpy as np
import pandas as pd
from keras.layers import Dense, Conv2D, BatchNormalization
from keras.layers import MaxPooling2D
from keras.layers import Input, Flatten, Dropout
from keras.layers import Activation
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.models import Model, load_model
from keras.utils import to_categorical
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
import time
from pathlib import Path

# *******************************************************************

# raw shapes: x:(num pts, 784), y:(num pts,)

def get_data_mnist(data_type, data_dir):
    
    data_dictionary = {}
    
    if data_type == 'calibration':
        # x data - reshape = (60000, 28, 28, 1) - channels last
        x_calib_raw = np.load(data_dir + 'x_mnist_calibration.npy', allow_pickle=True)
        x_calib_raw = x_calib_raw.astype('float32')/255.0  # normalize
        x_calib = np.reshape(x_calib_raw, (x_calib_raw.shape[0], 28, 28, 1))
        
        # y data - shape = (60000, ) -> (60000, 10)
        y_calib = to_categorical(np.load(data_dir + 'y_mnist_calibration.npy', 
                                         allow_pickle=True))
        
        # split
        x_calib1, x_calib2, y_calib1, y_calib2 = train_test_split(x_calib, y_calib,
        train_size=0.5, shuffle=True, stratify=y_calib)
        
        data_dictionary['x_calib1'] = x_calib1
        data_dictionary['y_calib1'] = y_calib1
        
        data_dictionary['x_calib2'] = x_calib2
        data_dictionary['y_calib2'] = y_calib2
        
    elif data_type == 'production':
        # x data - reshape = (10000, 28, 28, 1) - channels last
        x_prod_raw = np.load(data_dir + 'x_mnist_production.npy', allow_pickle=True)
        x_prod_raw = x_prod_raw.astype('float32')/255.0  # normalize
        x_prod = np.reshape(x_prod_raw, (x_prod_raw.shape[0], 28, 28, 1))
        
        # y data - shape = (10000, )
        y_prod = np.load(data_dir + 'y_mnist_production.npy', allow_pickle=True)
        
        data_dictionary['x_prod'] = x_prod
        data_dictionary['y_prod'] = y_prod
        
    else:
        raise NameError
   
    return data_dictionary
    
# end of get_data_mnist()
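
# helper sketch (an assumption, not part of the original code): one way the raw
# .npy files read by get_data_mnist() could have been created from the standard
# Keras MNIST data (calibration = training split, production = test split)
def make_mnist_npy_files(data_dir):
    from keras.datasets import mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    np.save(data_dir + 'x_mnist_calibration.npy', x_train.reshape(-1, 784))
    np.save(data_dir + 'y_mnist_calibration.npy', y_train)
    np.save(data_dir + 'x_mnist_production.npy', x_test.reshape(-1, 784))
    np.save(data_dir + 'y_mnist_production.npy', y_test)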

# *******************************************************************

if __name__ == '__main__':    
    
    ml_algorithm_name = 'cnn'
    file_name_stub = 'optuna_' + ml_algorithm_name 
    
    calculation_type = 'production'  # choose 'calibration' or 'production'
    
    base_directory = 'YOUR DIRECTORY'  # replace with your own base directory; must end with '/'
    data_directory = base_directory + 'data/'
    
    
    results_directory_stub = base_directory + file_name_stub + '/'
    if not Path(results_directory_stub).is_dir():
        os.mkdir(results_directory_stub)
              
    # fixed parameters - calibration
    maximum_epochs = 1000
    early_stop_epochs = 10
    learning_rate_epochs = 5
    optimizer_direction = 'minimize'
    number_of_random_points = 25  # random searches to start opt process
    maximum_time = 4*60*60  # seconds
              
    # fixed parameters - production
    threshold_error = 0.04  # validation loss
    number_of_models = 25
    
    # read data
    dict_data = get_data_mnist(calculation_type, data_directory)
            
    print('\n*** starting at',pd.Timestamp.now())
    start_time_total = time.time()

    # calibration 
    if calculation_type == 'calibration':
        
        # using 1/2 of calib data
        x_calib = dict_data['x_calib1']
        y_calib = dict_data['y_calib1']
        
        shape_of_input = x_calib.shape[1:] # (28, 28, 1) - channels last
        num_classes = y_calib.shape[1]
                
        results_directory = results_directory_stub + calculation_type + '/'
        if not Path(results_directory).is_dir():
            os.mkdir(results_directory)
                
        objective = Objective(x_calib, y_calib, results_directory,
                              maximum_epochs, early_stop_epochs,
                              learning_rate_epochs, shape_of_input, num_classes)
    
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        study = optuna.create_study(direction=optimizer_direction,
                sampler=TPESampler(n_startup_trials=number_of_random_points))
        
        study.optimize(objective, timeout=maximum_time)
        
        # save results
        df_results = study.trials_dataframe()
        df_results.to_pickle(results_directory + 'df_optuna_results.pkl')
        df_results.to_csv(results_directory + 'df_optuna_results.csv')
        
        
        elapsed_time_total = (time.time()-start_time_total)/60
        print('\n\ntotal elapsed time =',elapsed_time_total,' minutes')
                          
    elif calculation_type == 'production':
        
        # get optuna results parameters
        models_dir = results_directory_stub + 'calibration/'
        df_parameters = pd.read_pickle(models_dir + 'df_optuna_results.pkl')
        
        results_directory = results_directory_stub + calculation_type + '/'
        if not Path(results_directory).is_dir():
            os.mkdir(results_directory)
            
            
        x_prod = dict_data['x_prod']
        y_prod = dict_data['y_prod']
        
        num_classes = np.unique(y_prod).shape[0]
        class_names_list = []
        for i in range(num_classes):
            class_names_list.append('class ' + str(i))
                
        make_final_predictions(x_prod, y_prod, class_names_list, 
                               models_dir, results_directory, 
                               df_parameters,  
                               threshold_error, file_name_stub,
                               number_of_models, optimizer_direction)
             
    else:
        print('\ninvalid calculation type:',calculation_type)
        raise NameError