Revisiting Machine Learning Datasets

Let’s have a look at the CT Slice Localization dataset by Graf et al. (2011) as part of my series on exploring lesser-known datasets.

Contents

Dataset exploration
Preprocessing and ML algorithms
Results

Dataset exploration

Let’s load the dataset to understand the basic idea behind it.

import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Not gentleman-like, but it helps to keep the notebook clean ;)
# Installs a blanket "ignore" filter, so warnings raised by the code below are not shown.
import warnings
warnings.simplefilter('ignore')

# Read the raw CSV into a DataFrame, then show the first rows, the last rows
# and the summary statistics, in that order.
filepath_input_data = "./data/slice_localization_data.csv"
input_data = pd.read_csv(filepath_input_data)
for _preview in (input_data.head(5), input_data.tail(5), input_data.describe()):
    display(_preview)


  
    
      
      patientId
      value0
      value1
      value2
      value3
      value4
      value5
      value6
      value7
      value8
      ...
      value375
      value376
      value377
      value378
      value379
      value380
      value381
      value382
      value383
      reference
    
  
  
    
      count
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      ...
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
      53500.000000
    
    
      mean
      47.075701
      0.059627
      0.071558
      0.145819
      0.218728
      0.274762
      0.276189
      0.204531
      0.062281
      -0.042025
      ...
      -0.029404
      0.182913
      0.320112
      0.359373
      0.342889
      0.266091
      0.083049
      -0.031146
      -0.154524
      47.028039
    
    
      std
      27.414240
      0.174243
      0.196921
      0.300270
      0.359163
      0.378862
      0.369605
      0.351294
      0.292232
      0.268391
      ...
      0.085817
      0.383333
      0.463517
      0.478188
      0.471811
      0.437633
      0.279734
      0.098738
      0.122491
      22.347042
    
    
      min
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      -0.250000
      -0.250000
      -0.250000
      -0.250000
      ...
      -0.250000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      -0.250000
      -0.250000
      -0.250000
      1.738733
    
    
      25%
      23.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      -0.250000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      -0.250000
      29.891607
    
    
      50%
      46.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      -0.250000
      43.987893
    
    
      75%
      70.000000
      0.000000
      0.000000
      0.000000
      0.446429
      0.684477
      0.662382
      0.441412
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.996286
      0.999677
      0.999560
      0.949478
      0.000000
      0.000000
      0.000000
      63.735059
    
    
      max
      96.000000
      1.000000
      1.000000
      1.000000
      1.000000
      0.998790
      0.996468
      0.999334
      1.000000
      1.000000
      ...
      0.961279
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      0.999857
      0.996839
      0.942851
      97.489115

The data contains no descriptive labels. Hence, we have to read the documentation on the UCI ML repository. The first column is the patient ID, the last one is the location of the slice ranging from 0 to 180 where 0 denotes a slice at the top of the head. The other columns represent two histograms. The first one is the histogram of the bone structure of a slice and the second one is the histogram of detected air inclusions. Let’s visualize both histograms: bone structure and air inclusions. To do so, we first have to extract some information from the dataset:

def _histogram_mean_and_std(frame, columns):
    """Return the per-column mean and sample standard deviation as two lists.

    Uses the vectorized DataFrame reductions instead of calling
    ``describe()`` once per column and per statistic; ``describe()`` also
    computes count, min, max and quantiles, which were thrown away.
    ``DataFrame.std`` uses the same default (``ddof=1``) as the "std" row
    reported by ``describe()``.
    """
    selected = frame[columns]
    return selected.mean().tolist(), selected.std().tolist()


def _plot_mean_with_std_band(positions, mean_values, lower_bound, upper_bound, title):
    """Draw the mean curve in black and the mean +/- std band in gray."""
    plt.figure(figsize=(10, 7))
    plt.plot(mean_values, color="black")
    plt.fill_between(positions, lower_bound, upper_bound, color="gray")
    plt.xlabel("")
    plt.ylabel("")
    plt.title(title)
    plt.tight_layout()
    plt.show()


# Columns value0 .. value239 are plotted as the bone structure histogram.
inputrange_0 = ["value" + str(i) for i in range(240)]
bone_structure_histogram_mean, bone_structure_histogram_std = _histogram_mean_and_std(
    input_data, inputrange_0
)

x = list(range(len(bone_structure_histogram_mean)))
bones_lower_bound = np.subtract(bone_structure_histogram_mean, bone_structure_histogram_std)
bones_upper_bound = np.add(bone_structure_histogram_mean, bone_structure_histogram_std)

_plot_mean_with_std_band(
    x,
    bone_structure_histogram_mean,
    bones_lower_bound,
    bones_upper_bound,
    "Bone structure histogram - mean values and std",
)


# Columns value240 .. value383 are plotted as the air inclusions histogram.
inputrange_1 = ["value" + str(i) for i in range(240, 384)]
air_inclusions_histogram_mean, air_inclusions_histogram_std = _histogram_mean_and_std(
    input_data, inputrange_1
)

x = list(range(len(air_inclusions_histogram_mean)))
air_lower_bound = np.subtract(air_inclusions_histogram_mean, air_inclusions_histogram_std)
air_upper_bound = np.add(air_inclusions_histogram_mean, air_inclusions_histogram_std)

_plot_mean_with_std_band(
    x,
    air_inclusions_histogram_mean,
    air_lower_bound,
    air_upper_bound,
    "Air inclusions histogram - mean values and std",
)

Both histograms seem to cover a wide variety of ranges.

Preprocessing and ML algorithms

Let’s do some more scaling to aim for fast convergence:

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MaxAbsScaler

# Remove the patient ID column in place so it is neither scaled nor used as a feature.
input_data.drop(columns=["patientId"], inplace=True)

# Divide every remaining column (features and target alike) by its maximum
# absolute value and keep the result in a DataFrame with the original labels.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# and the split below shuffles individual rows, so slices of the same patient
# can end up in both sets - confirm this is intended.
scaler = MaxAbsScaler()
input_data_scaled = scaler.fit_transform(input_data)
input_data_scaled_df = input_data.copy()
input_data_scaled_df.loc[:, :] = input_data_scaled
display(input_data_scaled_df.head(3))

# We are dealing with physics/real-world meaning here, hence we need the unscaled values:
# sending a row of ones back through the scaler yields the per-column scale factors.
extract_scaling_function = scaler.inverse_transform(
    np.ones((1, input_data_scaled_df.shape[1]))
)


# Split the scaled frame into the target (y) and the feature matrix (X).
y_df = input_data_scaled_df["reference"].copy()
X_df = input_data_scaled_df.drop(columns="reference")

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_df.values,
    y_df.values,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

For the machine learning algorithms, we can reuse exactly the same ones that were used for this regression problem on predicting concrete compressive strength.

Results

After a few hours of training, we will end up with these results:

It looks like the deepest neural network predicts the CT slice position best.

References

F. Graf, H.-P. Kriegel, M. Schubert, S. Poelsterl, A. Cavallaro
2D Image Registration in CT Images using Radial Image Descriptors
In Medical Image Computing and Computer-Assisted Intervention (MICCAI),
Toronto, Canada, 2011.

	value6	value7	value8	...	value375	value376	value382	value383	reference
0	-0.25	-0.25	-0.25	...	-0.25	0.980381	-0.25	-0.25	21.803851
1	-0.25	-0.25	-0.25	...	-0.25	0.977008	-0.25	-0.25	21.745726
2	-0.25	-0.25	-0.25	...	-0.25	0.977008	-0.25	-0.25	21.687600
3	-0.25	-0.25	-0.25	...	-0.25	0.977008	-0.25	-0.25	21.629474
4	-0.25	-0.25	-0.25	...	-0.25	0.976833	-0.25	-0.25	21.571348

	patientId	value0	value1	value2	value3	value4	value5	value6	value7	value8	...	value375	value378	value379	value382	value383	reference
53495	96	0.591906	0.357764	0.000000	0.000000	0.552321	0.795304	0.946697	0.952227	0.84395	...	0.00	0.000000	0.000000	0.00	0.00	29.290398
53496	96	0.612313	0.000000	0.000000	0.000000	0.864160	0.820531	0.000000	0.938813	0.94374	...	0.00	0.000000	0.000000	0.00	0.00	27.945721
53497	96	0.612313	0.000000	0.000000	0.000000	0.864160	0.820531	0.000000	0.938813	0.94374	...	0.00	0.000000	0.000000	0.00	0.00	27.945721
53498	96	0.634921	0.904555	0.956087	0.980208	0.157664	0.000000	-0.250000	-0.250000	-0.25000	...	-0.25	0.994967	0.806688	-0.25	-0.25	14.582997
53499	96	0.654321	0.891021	0.882244	0.979282	0.000000	0.000000	-0.250000	-0.250000	-0.25000	...	-0.25	0.994671	0.000000	-0.25	-0.25	14.498955

	patientId	value0	value1	value2	value3	value4	value5	value6	value7	value8	...	value375	value376	value377	value378	value379	value380	value381	value382	value383	reference
count	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	...	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000	53500.000000
mean	47.075701	0.059627	0.071558	0.145819	0.218728	0.274762	0.276189	0.204531	0.062281	-0.042025	...	-0.029404	0.182913	0.320112	0.359373	0.342889	0.266091	0.083049	-0.031146	-0.154524	47.028039
std	27.414240	0.174243	0.196921	0.300270	0.359163	0.378862	0.369605	0.351294	0.292232	0.268391	...	0.085817	0.383333	0.463517	0.478188	0.471811	0.437633	0.279734	0.098738	0.122491	22.347042
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-0.250000	-0.250000	-0.250000	-0.250000	...	-0.250000	0.000000	0.000000	0.000000	0.000000	0.000000	-0.250000	-0.250000	-0.250000	1.738733
25%	23.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-0.250000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-0.250000	29.891607
50%	46.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-0.250000	43.987893
75%	70.000000	0.000000	0.000000	0.000000	0.446429	0.684477	0.662382	0.441412	0.000000	0.000000	...	0.000000	0.000000	0.996286	0.999677	0.999560	0.949478	0.000000	0.000000	0.000000	63.735059
max	96.000000	1.000000	1.000000	1.000000	1.000000	0.998790	0.996468	0.999334	1.000000	1.000000	...	0.961279	1.000000	1.000000	1.000000	1.000000	1.000000	0.999857	0.996839	0.942851	97.489115