Revisiting Machine Learning Datasets - Banknote Authentication

Once in a while I like to explore some lesser-known datasets. Let’s have a look at this dataset on banknote authentication. It seems like there are no academic references so far.

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import OneHotEncoder
import catboost 
import xgboost
import keras
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf 
# Let TensorFlow grow its GPU memory allocation on demand instead of
# reserving it all up front. (A fixed cap could be set instead via
# GPUOptions(per_process_gpu_memory_fraction=0.90).)
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
# Register a session built from this config as the one Keras will use.
keras_session = tf.Session(config=config)
set_session(keras_session)

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import (Input, Dense, BatchNormalization,Dropout)
from keras import optimizers
from keras import callbacks


from keras import backend as K
# Calls a private helper of the Keras TensorFlow backend; presumably returns
# the GPUs visible to the backend (confirm against the installed Keras
# version, since underscore-prefixed helpers are not a stable API).
K.tensorflow_backend._get_available_gpus()

from sklearn.model_selection import  train_test_split
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix



# Path of the raw banknote authentication data file.
filepath_input_data = "./data/data_banknote_authentication.txt"

# Column labels assigned to the file's columns, in order. Because names are
# supplied explicitly, pandas reads the first line of the file as data.
banknote_column_names = [
    'Variance of Wavelet Transformed Image',
    'Skewness of Wavelet Transformed Image',
    'Curtosis of Wavelet Transformed Image',
    'Entropy of Image',
    'Class',
]
input_data_df = pd.read_csv(filepath_input_data, names=banknote_column_names)

# Preview the first and last three rows, then show summary statistics
# (kept as the final expression so the notebook renders it).
for preview_df in (input_data_df.head(3), input_data_df.tail(3)):
    display(preview_df)
input_data_df.describe()


  
    
      
      Variance of Wavelet Transformed Image
      Skewness of Wavelet Transformed Image
      Curtosis of Wavelet Transformed Image
      Entropy of Image
      Class
    
  
  
    
      0
      3.6216
      8.6661
      -2.8073
      -0.44699
      0
    
    
      1
      4.5459
      8.1674
      -2.4586
      -1.46210
      0
    
    
      2
      3.8660
      -2.6383
      1.9242
      0.10645
      0

	Variance of Wavelet Transformed Image	Skewness of Wavelet Transformed Image	Curtosis of Wavelet Transformed Image	Entropy of Image	Class
0	3.6216	8.6661	-2.8073	-0.44699	0
1	4.5459	8.1674	-2.4586	-1.46210	0
2	3.8660	-2.6383	1.9242	0.10645	0


  
    
      
      Variance of Wavelet Transformed Image
      Skewness of Wavelet Transformed Image
      Curtosis of Wavelet Transformed Image
      Entropy of Image
      Class
    
  
  
    
      1369
      -3.7503
      -13.45860
      17.5932
      -2.7771
      1
    
    
      1370
      -3.5637
      -8.38270
      12.3930
      -1.2823
      1
    
    
      1371
      -2.5419
      -0.65804
      2.6842
      1.1952
      1

	Variance of Wavelet Transformed Image	Skewness of Wavelet Transformed Image	Curtosis of Wavelet Transformed Image	Entropy of Image	Class
1369	-3.7503	-13.45860	17.5932	-2.7771	1
1370	-3.5637	-8.38270	12.3930	-1.2823	1
1371	-2.5419	-0.65804	2.6842	1.1952	1


  
    
      
      Variance of Wavelet Transformed Image
      Skewness of Wavelet Transformed Image
      Curtosis of Wavelet Transformed Image
      Entropy of Image
      Class
    
  
  
    
      1369
      -3.7503
      -13.45860
      17.5932
      -2.7771
      1
    
    
      1370
      -3.5637
      -8.38270
      12.3930
      -1.2823
      1
    
    
      1371
      -2.5419
      -0.65804
      2.6842
      1.1952
      1

	Variance of Wavelet Transformed Image	Skewness of Wavelet Transformed Image	Curtosis of Wavelet Transformed Image	Entropy of Image	Class
1369	-3.7503	-13.45860	17.5932	-2.7771	1
1370	-3.5637	-8.38270	12.3930	-1.2823	1
1371	-2.5419	-0.65804	2.6842	1.1952	1

Let’s have a look at how the variables are distributed:

# Split the frame into target and features: `y` is the 'Class' column as a
# column vector of shape (n_samples, 1); `X_df` / `X` hold every other column.
y = input_data_df['Class'].values.reshape(-1, 1)
X_df = input_data_df.drop(['Class'], axis=1)
X = X_df.values

# Lay the box plots out on a two-column grid. The row count uses ceiling
# division so that an odd number of features still gets a slot for the last
# plot (floor division would request too few rows and plt.subplot would fail).
n_features = X_df.shape[-1]
n_cols = 2
n_rows = (n_features + n_cols - 1) // n_cols

plt.close('all')
plt.figure(figsize=(8, n_rows * 4))
# One notched box plot per feature; subplot positions are 1-based.
for position, feature in enumerate(X_df.columns, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.boxplot(X_df[feature], meanline=False, notch=True, labels=[''])
    plt.title(feature)
plt.tight_layout()
plt.show()

Let’s see if we can distinguish between both classes visually:

# Draw one example per class: the first row belonging to each class value,
# plotted as its feature values in column order (no scaling applied).
plt.figure(figsize=(11, 9))
class_column = input_data_df['Class']
for class_value in class_column.unique():
    first_sample = X_df.loc[class_column == class_value].iloc[0].values
    plt.plot(first_sample, label=class_value)
plt.title("Examples of both classes (unscaled)")
plt.legend()
plt.show()

These samples seem pretty distinguishable.

Let’s train the algorithms on a scaled dataset and see how they perform (the setup is similar to the one used for the concrete compressive strength dataset):

This was too easy ;)

I guess that this is the approach taken to prevent scanners from scanning banknotes and printers from printing them.