Once in a while I like to explore some lesser-known datasets. Let’s have a look at this dataset on glass identification.

Domain knowledge and original publication

The dataset originates from work by Evett and Spiehler [1]. The goal of the paper was to introduce a decision framework for determining what kind of glass was found at a crime scene or on clothing. Unfortunately, the original paper provides no reference metrics to compare against.

Dataset exploration

Let’s have a look at the dataset:

import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

input_file_path = "./data/glass.data"
input_header = ["Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "type"]
input_data = pd.read_csv(input_file_path,
                         names=input_header)
# the Id column is just a running index and carries no information
input_data.drop(["Id"], axis=1, inplace=True)

display(input_data.head(2))
display(input_data.tail(2))
input_data.describe()

RI Na Mg Al Si K Ca Ba Fe type
0 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0.0 0.0 1
1 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0.0 0.0 1

RI Na Mg Al Si K Ca Ba Fe type
212 1.51651 14.38 0.0 1.94 73.61 0.0 8.48 1.57 0.0 7
213 1.51711 14.23 0.0 2.08 73.36 0.0 8.62 1.67 0.0 7

RI Na Mg Al Si K Ca Ba Fe type
count 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000
mean 1.518365 13.407850 2.684533 1.444907 72.650935 0.497056 8.956963 0.175047 0.057009 2.780374
std 0.003037 0.816604 1.442408 0.499270 0.774546 0.652192 1.423153 0.497219 0.097439 2.103739
min 1.511150 10.730000 0.000000 0.290000 69.810000 0.000000 5.430000 0.000000 0.000000 1.000000
25% 1.516523 12.907500 2.115000 1.190000 72.280000 0.122500 8.240000 0.000000 0.000000 1.000000
50% 1.517680 13.300000 3.480000 1.360000 72.790000 0.555000 8.600000 0.000000 0.000000 2.000000
75% 1.519157 13.825000 3.600000 1.630000 73.087500 0.610000 9.172500 0.000000 0.100000 3.000000
max 1.533930 17.380000 4.490000 3.500000 75.410000 6.210000 16.190000 3.150000 0.510000 7.000000

plt.close('all')
plt.figure(figsize=(7, (input_data.shape[-1] // 2) * 3))
# one boxplot per column, arranged in a grid with two columns of subplots
idx = 1
for col in input_data:
    plt.subplot(input_data.shape[-1] // 2, 2, idx)
    idx += 1
    plt.boxplot(input_data[col], meanline=False, notch=True, labels=[''])
    plt.title(col)
plt.tight_layout()
plt.show()

Since each variable has a different scale and we want to improve convergence, it is time to scale them to [0, 1]:

from sklearn.preprocessing import MaxAbsScaler

y_df = input_data["type"].copy()
X_df = input_data.copy()
X_df.drop(["type"], axis=1, inplace=True)

input_data_scaled_df = X_df.copy()
scaler = MaxAbsScaler()
input_data_scaled = scaler.fit_transform(X_df)
input_data_scaled_df.loc[:,:] = input_data_scaled

# recover the per-feature scaling factors (the maximum absolute values)
# by inverse-transforming a row of ones
extract_scaling_function = np.ones((1, input_data_scaled_df.shape[1]))
extract_scaling_function = scaler.inverse_transform(extract_scaling_function)
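
As a quick sanity check on this trick (not part of the original analysis), multiplying a scaled row by these factors should reproduce the original feature values:

# sanity check: undoing the scaling by hand should give back the raw values
reconstructed = input_data_scaled_df.iloc[0].values * extract_scaling_function[0]
print(np.allclose(reconstructed, X_df.iloc[0].values))  # expected: True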

Since this is my brute-force approach, let’s throw some ML algorithms at it:

from sklearn.model_selection import GridSearchCV, train_test_split
X_train, X_test, y_train, y_test = train_test_split(input_data_scaled_df, y_df,
                                                     test_size=0.2, random_state=42, shuffle=True)
comment = 'original dataset; scaled;'
datasets = {}
dataset_id = 0
datasets[dataset_id] = {'X_train': X_train, 'X_test': X_test,
                        'y_train': y_train, 'y_test': y_test,
                        'scaler': scaler, 'scaler_array': extract_scaling_function,
                        'comment': comment, 'dataset': dataset_id}
dataset_id += 1

import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.preprocessing import OneHotEncoder

import xgboost
import keras
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
# TensorFlow 1.x-style session setup: let the GPU memory grow as needed
config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.90
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import (Input, Dense, BatchNormalization,Dropout)
from keras import optimizers
from keras import callbacks


from keras import backend as K
# list the GPUs visible to the TensorFlow backend
K.tensorflow_backend._get_available_gpus()
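
All of the training helpers below take a scorer for their grid searches. The scorer used in the original run is not shown in this excerpt, so plain accuracy is assumed here as a stand-in:

# assumption: the original scorer definition is not part of the excerpt,
# plain accuracy is used as a stand-in
from sklearn.metrics import make_scorer
scorer = make_scorer(accuracy_score)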

def train_test_Gaussian_NB_classification(X_train, X_test, y_train, y_test,scorer, dataset_id):
    Gaussian_NB_classification = GaussianNB()
    grid_obj = sklearn.model_selection.GridSearchCV(Gaussian_NB_classification,
                                                          param_grid={},
                                                          cv=5,
                                                          n_jobs=-1,
                                                          scoring=scorer)
    start_time = time.time()
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    prediction = grid_fit.predict(X_test)
    accuracy = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = sklearn.metrics.classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)
    

    return {'Classification type' : 'Gaussian Naive Bayes Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Confusion Matrix' : confusionmatrix,
            'Training time' : training_time,
            'dataset' : dataset_id}


def train_test_decision_tree_classification(X_train, X_test,
                                            y_train, y_test,
                                            scorer, dataset_id):
    decision_tree_classification = DecisionTreeClassifier(random_state=42)
    grid_parameters_decision_tree_classification = {'max_depth' : [None, 3,5,7,9,10,11]}
    start_time = time.time()
    grid_obj = GridSearchCV(decision_tree_classification,
                            param_grid=grid_parameters_decision_tree_classification,
                            cv=5, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_decision_tree_classification = grid_fit.best_estimator_
    prediction = best_decision_tree_classification.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)

    return {'Classification type' : 'Decision Tree Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Confusion Matrix' : confusionmatrix,
            'Training time' : training_time,
            'dataset' : dataset_id}

def train_test_random_forest_classification(X_train, X_test,
                                            y_train, y_test,
                                            scorer, dataset_id):
    random_forest_classification = RandomForestClassifier(random_state=42)
    grid_parameters_random_forest_classification = {'n_estimators' : [3,5,10,15,18],
                                                     'max_depth' : [None, 2,3,5,7,9]}
    start_time = time.time()
    grid_obj = GridSearchCV(random_forest_classification,
                            param_grid=grid_parameters_random_forest_classification,
                            cv=5, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_random_forest_classifier = grid_fit.best_estimator_
    prediction = best_random_forest_classifier.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)
    
    
    return {'Classification type' : 'Random Forest Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Confusion Matrix' : confusionmatrix,
            'Training time' : training_time,
            'dataset' : dataset_id}

def xgboost_classification(X_train, X_test,
                           y_train, y_test,
                           scorer, dataset_id):
    x_gradient_boosting_classification = xgboost.XGBClassifier(silent=True, random_state=42)
    grid_parameters_x_gradient_boosting_classification = {'n_estimators' : [3,60,120,150,200],
                                                    'max_depth' : [3,15],
                                                    'learning_rate' :[0.001, 0.01, 0.1],
                                                    'booster' : ['gbtree', 'dart']}
    start_time = time.time()
    grid_obj = GridSearchCV(x_gradient_boosting_classification,
                            param_grid=grid_parameters_x_gradient_boosting_classification,
                            cv=5, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_gradient_boosting_classification = grid_fit.best_estimator_
    prediction = best_gradient_boosting_classification.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)
    
    
    return {'Classification type' : 'XGBoost Classification', 
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Confusion Matrix' : confusionmatrix,
            'Training time' : training_time,
            'dataset' : dataset_id}

def build_baseline_model_1(input_dim, output_dim):
    # small fully connected baseline: two hidden blocks with dropout,
    # softmax output over the glass types
    model = Sequential()
    model.add(Dense(input_dim, input_dim=input_dim, activation='relu'))
    model.add(Dense(input_dim*2, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(input_dim*2, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(input_dim//2, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model

def run_baseline_model_1(X_train, X_test,
                         y_train, y_test,
                         dataset_id, epochs=1000,
                         validation_split=0.2,
                         batch_size=32):
    # one-hot encode the integer class labels for the softmax output
    OHE = OneHotEncoder(sparse=False)
    OHE.fit(np.asarray(y_train).reshape(-1,1))
    y_train_OHE = OHE.transform(np.asarray(y_train).reshape(-1,1))
    y_test_OHE = OHE.transform(np.asarray(y_test).reshape(-1,1))
    model = build_baseline_model_1(X_train.shape[1],
                                   output_dim=y_train_OHE.shape[1])

    # keep only the weights with the best validation loss
    callback_file_path = 'keras_models/baseline_model_1_'+str(dataset_id)+'_best.hdf5'
    checkpoint = callbacks.ModelCheckpoint(callback_file_path,
                                           monitor='val_loss',
                                           save_best_only=True,
                                           save_weights_only=True)
    
    start_time = time.time() 
    model.fit(X_train, y_train_OHE,callbacks=[checkpoint],
              batch_size=batch_size, epochs=epochs,
              validation_split=validation_split)
    training_time = time.time() - start_time
    history= model.history.history
    # load best model
    model.load_weights(callback_file_path) 
    prediction = model.predict(X_test)
    prediction = OHE.inverse_transform(prediction)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)
    
    return {'Classification type' : 'Baseline_model_1',
            'model' : [callback_file_path,history],
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Confusion Matrix' : confusionmatrix,
            'Training time' : training_time, 
            'dataset' : dataset_id}

#[...]
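
The remaining helpers and the actual runner are omitted above. Just to make the comparison below concrete, here is a minimal sketch of how the helpers could be wired together; the loop, the results list and the summary table are my own illustration, not the original code:

# hypothetical runner (not the original code): call each helper on the first
# dataset variant and collect the returned result dictionaries
ds = datasets[0]
results = []
for clf_fn in [train_test_Gaussian_NB_classification,
               train_test_decision_tree_classification,
               train_test_random_forest_classification,
               xgboost_classification]:
    results.append(clf_fn(ds['X_train'], ds['X_test'],
                          ds['y_train'], ds['y_test'],
                          scorer, ds['dataset']))
results.append(run_baseline_model_1(ds['X_train'], ds['X_test'],
                                     ds['y_train'], ds['y_test'],
                                     ds['dataset']))

# collapse the result dictionaries into a small overview table
summary = pd.DataFrame([{'Model': r['Classification type'],
                         'Accuracy': r['Accuracy'],
                         'Training time [s]': r['Training time']}
                        for r in results])
display(summary.sort_values('Accuracy', ascending=False))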

Well, it looks like XGBoost performs best. I’m wondering why the neural networks perform so badly; perhaps they are already a bit too deep for this dataset.

References

[1] Evett, I. W. and E. J. Spiehler (1987): Rule Induction in Forensic Science. In: KBS in Government, 107–118. Available online: http://www.cs.ucl.ac.uk/staff/W.Langdon/ftp/papers/evett_1987_rifs.pdf