Once in a while I like to explore lesser-known datasets. Let’s have a look at this dataset on glass identification.
Domain knowledge and original publication
The dataset originates from work by Evett and Spiehler [1]. The goal of the paper was to introduce a decision framework for determining what kind of glass was found at a crime scene or on clothing. Unfortunately, the original paper provides no reference metrics to compare against.
Dataset exploration
Let’s have a look at the dataset:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# the raw file has no header row, so we supply the column names ourselves
input_file_path = "./data/glass.data"
input_header = ["Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "type"]
input_data = pd.read_csv(input_file_path,
                         names=input_header)
# the Id column carries no information for classification
input_data.drop(["Id"], axis=1, inplace=True)
display(input_data.head(2))
display(input_data.tail(2))
input_data.describe()
|     | RI | Na | Mg | Al | Si | K | Ca | Ba | Fe | type |
|-----|----|----|----|----|----|---|----|----|----|------|
| 0 | 1.52101 | 13.64 | 4.49 | 1.10 | 71.78 | 0.06 | 8.75 | 0.0 | 0.0 | 1 |
| 1 | 1.51761 | 13.89 | 3.60 | 1.36 | 72.73 | 0.48 | 7.83 | 0.0 | 0.0 | 1 |

|     | RI | Na | Mg | Al | Si | K | Ca | Ba | Fe | type |
|-----|----|----|----|----|----|---|----|----|----|------|
| 212 | 1.51651 | 14.38 | 0.0 | 1.94 | 73.61 | 0.0 | 8.48 | 1.57 | 0.0 | 7 |
| 213 | 1.51711 | 14.23 | 0.0 | 2.08 | 73.36 | 0.0 | 8.62 | 1.67 | 0.0 | 7 |

|       | RI | Na | Mg | Al | Si | K | Ca | Ba | Fe | type |
|-------|----|----|----|----|----|---|----|----|----|------|
| count | 214.000000 | 214.000000 | 214.000000 | 214.000000 | 214.000000 | 214.000000 | 214.000000 | 214.000000 | 214.000000 | 214.000000 |
| mean  | 1.518365 | 13.407850 | 2.684533 | 1.444907 | 72.650935 | 0.497056 | 8.956963 | 0.175047 | 0.057009 | 2.780374 |
| std   | 0.003037 | 0.816604 | 1.442408 | 0.499270 | 0.774546 | 0.652192 | 1.423153 | 0.497219 | 0.097439 | 2.103739 |
| min   | 1.511150 | 10.730000 | 0.000000 | 0.290000 | 69.810000 | 0.000000 | 5.430000 | 0.000000 | 0.000000 | 1.000000 |
| 25%   | 1.516523 | 12.907500 | 2.115000 | 1.190000 | 72.280000 | 0.122500 | 8.240000 | 0.000000 | 0.000000 | 1.000000 |
| 50%   | 1.517680 | 13.300000 | 3.480000 | 1.360000 | 72.790000 | 0.555000 | 8.600000 | 0.000000 | 0.000000 | 2.000000 |
| 75%   | 1.519157 | 13.825000 | 3.600000 | 1.630000 | 73.087500 | 0.610000 | 9.172500 | 0.000000 | 0.100000 | 3.000000 |
| max   | 1.533930 | 17.380000 | 4.490000 | 3.500000 | 75.410000 | 6.210000 | 16.190000 | 3.150000 | 0.510000 | 7.000000 |
plt.close('all')
plt.figure(figsize=(7, (input_data.shape[-1] // 2) * 3))
idx = 1
for col in input_data:
    plt.subplot(input_data.shape[-1] // 2, 2, idx)
    idx += 1
    plt.boxplot(input_data[col], meanline=False, notch=True, labels=[''])
    plt.title(col)
plt.tight_layout()
plt.show()
Since each variable has a different scale and we want to speed up convergence, it is time to scale them to [0, 1]. MaxAbsScaler divides each feature by its maximum absolute value, which maps our non-negative features to [0, 1]:
from sklearn.preprocessing import MaxAbsScaler

y_df = input_data["type"].copy()
X_df = input_data.copy()
X_df.drop(["type"], axis=1, inplace=True)
input_data_scaled_df = X_df.copy()
scaler = MaxAbsScaler()
input_data_scaled = scaler.fit_transform(X_df)
input_data_scaled_df.loc[:, :] = input_data_scaled
# keep the per-feature scaling factors so values can be mapped back to original units later
extract_scaling_function = np.ones((1, input_data_scaled_df.shape[1]))
extract_scaling_function = scaler.inverse_transform(extract_scaling_function)
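As a quick sanity check (not part of the original pipeline), we can verify that the scaled features indeed lie in [0, 1] and that the stored scaling factors recover the original units:

# illustrative sanity check, reusing the objects defined above
print(input_data_scaled_df.min().min(), input_data_scaled_df.max().max())
# multiplying a scaled row by the per-feature factors should recover the original values
row_restored = input_data_scaled_df.iloc[0].values * extract_scaling_function[0]
print(np.allclose(row_restored, X_df.iloc[0].values))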
Since this is my brute-force approach, let’s throw some ML algorithms at it:
from sklearn.model_selection import GridSearchCV, train_test_split

X_train, X_test, y_train, y_test = train_test_split(input_data_scaled_df, y_df,
                                                     test_size=0.2, random_state=42, shuffle=True)
comment = 'original dataset; scaled;'
datasets = {}
dataset_id = 0
datasets[dataset_id] = {'X_train': X_train,
                        'X_test': X_test,
                        'y_train': y_train,
                        'y_test': y_test,
                        'scaler': scaler,
                        'scaler_array': extract_scaling_function,
                        'comment': comment,
                        'dataset': dataset_id}
dataset_id += 1
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import xgboost

import keras
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
# TensorFlow 1.x session setup: let GPU memory grow as needed instead of grabbing it all
config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.90
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Input, Dense, BatchNormalization, Dropout
from keras import optimizers
from keras import callbacks
from keras import backend as K
# confirm that a GPU is visible to the Keras backend
K.tensorflow_backend._get_available_gpus()
def train_test_Gaussian_NB_classification(X_train, X_test, y_train, y_test, scorer, dataset_id):
    Gaussian_NB_classification = GaussianNB()
    # no hyperparameters to tune, but GridSearchCV still gives us cross-validation
    grid_obj = GridSearchCV(Gaussian_NB_classification,
                            param_grid={},
                            cv=5,
                            n_jobs=-1,
                            scoring=scorer)
    start_time = time.time()
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    prediction = grid_fit.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)
    return {'Classification type': 'Gaussian Naive Bayes Classification',
            'model': grid_fit,
            'Predictions': prediction,
            'Accuracy': accuracy,
            'Classification Report': classification_rep,
            'Confusion Matrix': confusionmatrix,
            'Training time': training_time,
            'dataset': dataset_id}
def train_test_decision_tree_classification(X_train, X_test,
                                            y_train, y_test,
                                            scorer, dataset_id):
    decision_tree_classification = DecisionTreeClassifier(random_state=42)
    grid_parameters_decision_tree_classification = {'max_depth': [None, 3, 5, 7, 9, 10, 11]}
    start_time = time.time()
    grid_obj = GridSearchCV(decision_tree_classification,
                            param_grid=grid_parameters_decision_tree_classification,
                            cv=5, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_decision_tree_classification = grid_fit.best_estimator_
    prediction = best_decision_tree_classification.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)
    return {'Classification type': 'Decision Tree Classification',
            'model': grid_fit,
            'Predictions': prediction,
            'Accuracy': accuracy,
            'Classification Report': classification_rep,
            'Confusion Matrix': confusionmatrix,
            'Training time': training_time,
            'dataset': dataset_id}
def train_test_random_forest_classification(X_train, X_test,
                                            y_train, y_test,
                                            scorer, dataset_id):
    random_forest_classification = RandomForestClassifier(random_state=42)
    grid_parameters_random_forest_classification = {'n_estimators': [3, 5, 10, 15, 18],
                                                    'max_depth': [None, 2, 3, 5, 7, 9]}
    start_time = time.time()
    grid_obj = GridSearchCV(random_forest_classification,
                            param_grid=grid_parameters_random_forest_classification,
                            cv=5, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_random_forest_classifier = grid_fit.best_estimator_
    prediction = best_random_forest_classifier.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)
    return {'Classification type': 'Random Forest Classification',
            'model': grid_fit,
            'Predictions': prediction,
            'Accuracy': accuracy,
            'Classification Report': classification_rep,
            'Confusion Matrix': confusionmatrix,
            'Training time': training_time,
            'dataset': dataset_id}
def xgboost_classification(X_train, X_test,
                           y_train, y_test,
                           scorer, dataset_id):
    x_gradient_boosting_classification = xgboost.XGBClassifier(silent=True, random_state=42)
    grid_parameters_x_gradient_boosting_classification = {'n_estimators': [3, 60, 120, 150, 200],
                                                          'max_depth': [3, 15],
                                                          'learning_rate': [0.001, 0.01, 0.1],
                                                          'booster': ['gbtree', 'dart']}
    start_time = time.time()
    grid_obj = GridSearchCV(x_gradient_boosting_classification,
                            param_grid=grid_parameters_x_gradient_boosting_classification,
                            cv=5, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_gradient_boosting_classification = grid_fit.best_estimator_
    prediction = best_gradient_boosting_classification.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)
    return {'Classification type': 'XGBoost Classification',
            'model': grid_fit,
            'Predictions': prediction,
            'Accuracy': accuracy,
            'Classification Report': classification_rep,
            'Confusion Matrix': confusionmatrix,
            'Training time': training_time,
            'dataset': dataset_id}
def build_baseline_model_1(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(input_dim, input_dim=input_dim, activation='relu'))
    model.add(Dense(input_dim*2, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(input_dim*2, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(input_dim//2, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model
def run_baseline_model_1(X_train, X_test,
                         y_train, y_test,
                         dataset_id, epochs=1000,
                         validation_split=0.2,
                         batch_size=32):
    # one-hot encode the labels for the softmax output layer
    OHE = OneHotEncoder(sparse=False)
    OHE.fit(np.asarray(y_train).reshape(-1, 1))
    y_train_OHE = OHE.transform(np.asarray(y_train).reshape(-1, 1))
    y_test_OHE = OHE.transform(np.asarray(y_test).reshape(-1, 1))
    model = build_baseline_model_1(X_train.shape[1],
                                   output_dim=y_train_OHE.shape[1])
    callback_file_path = 'keras_models/baseline_model_1_' + str(dataset_id) + '_best.hdf5'
    checkpoint = callbacks.ModelCheckpoint(callback_file_path,
                                           monitor='val_loss',
                                           save_best_only=True,
                                           save_weights_only=True)
    start_time = time.time()
    model.fit(X_train, y_train_OHE, callbacks=[checkpoint],
              batch_size=batch_size, epochs=epochs,
              validation_split=validation_split)
    training_time = time.time() - start_time
    history = model.history.history
    # load the weights of the best epoch (lowest validation loss)
    model.load_weights(callback_file_path)
    prediction = model.predict(X_test)
    prediction = OHE.inverse_transform(prediction)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    confusionmatrix = confusion_matrix(y_true=y_test, y_pred=prediction)
    return {'Classification type': 'Baseline_model_1',
            'model': [callback_file_path, history],
            'Predictions': prediction,
            'Accuracy': accuracy,
            'Classification Report': classification_rep,
            'Confusion Matrix': confusionmatrix,
            'Training time': training_time,
            'dataset': dataset_id}
#[...]
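The omitted part presumably loops over these models and collects the returned result dictionaries. A minimal sketch of how such a loop could look (my assumption, with 'accuracy' as the grid-search scorer and the functions and dataset dictionary defined above) is:

# illustrative sketch only - the actual evaluation loop is omitted above
results = []
for d_id, d in datasets.items():
    for fn in (train_test_Gaussian_NB_classification,
               train_test_decision_tree_classification,
               train_test_random_forest_classification,
               xgboost_classification):
        results.append(fn(d['X_train'], d['X_test'],
                          d['y_train'], d['y_test'],
                          'accuracy', d_id))
    results.append(run_baseline_model_1(d['X_train'], d['X_test'],
                                        d['y_train'], d['y_test'], d_id))
results_df = pd.DataFrame([{'Model': r['Classification type'],
                            'Accuracy': r['Accuracy'],
                            'Training time': r['Training time']} for r in results])
display(results_df.sort_values('Accuracy', ascending=False))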
Well, it looks like XGBoost performs best. I’m wondering why the neural networks perform so badly. Perhaps they are already a bit too deep for such a small dataset.
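If depth really is the issue, a natural follow-up experiment (not run here) would be a much shallower variant of the baseline model:

# hypothetical shallower variant - not part of the original experiments
# reuses the Sequential/Dense imports from above
def build_shallow_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(input_dim, input_dim=input_dim, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model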
References
[1] Evett, I. W. and E. J. Spiehler (1987): Rule Induction in Forensic Science. In: KBS in Government, pp. 107-118. Available online: http://www.cs.ucl.ac.uk/staff/W.Langdon/ftp/papers/evett_1987_rifs.pdf