Let’s have a look at another dataset - this time the Abalone dataset, originating from work by Nash et al. (1994) and Waugh (1995).
This turned out to be one of the weirdest datasets I’ve seen - and yes, coming from the geosciences, I’m used to a lot of weird stuff.
Dataset exploration
The aim with this dataset is to predict the age of abalone without cutting them open. Let’s take a look - first, we have to load the data.
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error, median_absolute_error
import sklearn.metrics
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from skgarden.mondrian import MondrianForestRegressor
import xgboost
names = ["Sex", "Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Rings"]
inputData = pd.read_csv("./data/abalone.data", names=names)
display(inputData.sample(10))
# Encode Sex numerically: M -> 1, F -> 0, I (infant) -> -1
sexEncoded = []
for i in range(len(inputData["Sex"].values)):
    if inputData["Sex"].values[i] == "M":
        value = 1
    elif inputData["Sex"].values[i] == "F":
        value = 0
    else:
        value = -1
    sexEncoded.append(value)
inputData["Sex"] = sexEncoded
display(inputData.sample(10))
| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings |
|---|---|---|---|---|---|---|---|---|---|
| 686 | F | 0.535 | 0.400 | 0.150 | 0.8045 | 0.3345 | 0.2125 | 0.2100 | 13 |
| 644 | M | 0.450 | 0.340 | 0.130 | 0.3715 | 0.1605 | 0.0795 | 0.1050 | 9 |
| 2028 | F | 0.570 | 0.435 | 0.150 | 0.8295 | 0.3875 | 0.1560 | 0.2450 | 10 |
| 4047 | M | 0.620 | 0.485 | 0.145 | 1.0030 | 0.4655 | 0.2195 | 0.2800 | 11 |
| 2801 | M | 0.640 | 0.515 | 0.080 | 1.0420 | 0.5150 | 0.1755 | 0.1750 | 10 |
| 1118 | I | 0.535 | 0.390 | 0.125 | 0.5990 | 0.2595 | 0.1490 | 0.1690 | 9 |
| 3859 | F | 0.570 | 0.440 | 0.190 | 1.0180 | 0.4470 | 0.2070 | 0.2650 | 9 |
| 1209 | F | 0.780 | 0.630 | 0.215 | 2.6570 | 1.4880 | 0.4985 | 0.5860 | 11 |
| 4069 | I | 0.455 | 0.335 | 0.105 | 0.4055 | 0.1750 | 0.0920 | 0.1185 | 8 |
| 537 | M | 0.290 | 0.230 | 0.075 | 0.1165 | 0.0430 | 0.0255 | 0.0400 | 7 |
After encoding, the Sex column is numeric:

| | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings |
|---|---|---|---|---|---|---|---|---|---|
| 2970 | 1 | 0.690 | 0.515 | 0.180 | 1.8445 | 0.9815 | 0.4655 | 0.3410 | 13 |
| 2522 | 1 | 0.545 | 0.450 | 0.150 | 0.8795 | 0.3870 | 0.1500 | 0.2625 | 11 |
| 3047 | 1 | 0.590 | 0.435 | 0.165 | 0.9765 | 0.4525 | 0.2395 | 0.2350 | 9 |
| 3118 | 1 | 0.510 | 0.415 | 0.145 | 0.7510 | 0.3295 | 0.1835 | 0.2030 | 8 |
| 828 | -1 | 0.410 | 0.325 | 0.100 | 0.3940 | 0.2080 | 0.0655 | 0.1060 | 6 |
| 2755 | 0 | 0.525 | 0.415 | 0.150 | 0.7055 | 0.3290 | 0.1470 | 0.1990 | 10 |
| 3633 | -1 | 0.300 | 0.220 | 0.065 | 0.1195 | 0.0520 | 0.0155 | 0.0350 | 5 |
| 1754 | 1 | 0.720 | 0.550 | 0.205 | 2.1250 | 1.1455 | 0.4425 | 0.5110 | 13 |
| 3560 | 1 | 0.570 | 0.470 | 0.155 | 1.1860 | 0.6355 | 0.2315 | 0.2770 | 10 |
| 3925 | 0 | 0.470 | 0.350 | 0.115 | 0.4870 | 0.1955 | 0.1270 | 0.1550 | 8 |
Let’s have a look at all individual classes:
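A quick way to do that is to simply count how often each ring value occurs. Here is a minimal sketch, reusing the pandas/matplotlib imports from above:

# Count how many samples fall into each ring class and plot the distribution
ringCounts = inputData["Rings"].value_counts().sort_index()
print(ringCounts)
fig, ax = plt.subplots(figsize=(8, 4))
ringCounts.plot.bar(ax=ax)
ax.set_xlabel("Rings")
ax.set_ylabel("Number of samples")
plt.show()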
Well, the classes are really not distributed in a useful way. Many ring classes contain only 1-2 samples and are therefore difficult to capture in training, validation and testing; this also leads to problems with scikit-learn’s internal class comparison. Therefore, we can’t use a classification algorithm without grouping the classes first - as many authors did. On the other hand, we want to predict age, which strictly speaking is a continuous quantity anyway, so let’s treat this as a regression problem instead.
y = inputData["Rings"]
X = inputData.copy(deep=True)
X.drop(["Rings"], axis=1, inplace=True)
scaler = MaxAbsScaler()
scaler.fit(X)
X = scaler.transform(X)
X = pd.DataFrame(X, columns=names[:-1])
# Test set performance (final 1044 examples, first 3133 used for training)
X_train = X.values[0:3134]
X_test = X.values[3134:]
y_train = y[0:3134]
y_test = y[3134:]
datasets = {}
datasets[0] = {'X_train': X_train,
               'X_test': X_test,
               'y_train': y_train,
               'y_test': y_test}
Brute force
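A minimal sketch of such a brute-force comparison - simply looping over the regressors imported at the top with mostly default hyperparameters (the actual run may well have used GridSearchCV and different settings) - could look like this:

# Sketch: fit each regressor on the fixed train/test split and collect error metrics.
# Model choices and parameters here are placeholders, not the exact original setup.
models = {
    "LinearRegression": LinearRegression(),
    "BayesianRidge": BayesianRidge(),
    "DecisionTree": DecisionTreeRegressor(),
    "KNeighbors": KNeighborsRegressor(),
    "LinearSVR": LinearSVR(),
    "RandomForest": RandomForestRegressor(n_estimators=100),
    "MondrianForest": MondrianForestRegressor(),
    "XGBoost": xgboost.XGBRegressor(),
}
results = {}
for name, model in models.items():
    start = time.time()
    model.fit(datasets[0]['X_train'], datasets[0]['y_train'])
    y_pred = model.predict(datasets[0]['X_test'])
    results[name] = {"R2": r2_score(datasets[0]['y_test'], y_pred),
                     "MAE": mean_absolute_error(datasets[0]['y_test'], y_pred),
                     "RMSE": np.sqrt(mean_squared_error(datasets[0]['y_test'], y_pred)),
                     "runtime [s]": time.time() - start}
display(pd.DataFrame(results).T)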
It is time to have a look at the results - and they are awful ;).
TPOT
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error
import sklearn.metrics
from tpot import TPOTRegressor
names = ["Sex", "Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Rings"]
inputData = pd.read_csv("./data/abalone.data", names=names)
# Encode Sex numerically: M -> 1, F -> 0, I (infant) -> -1
sexEncoded = []
for i in range(len(inputData["Sex"].values)):
    if inputData["Sex"].values[i] == "M":
        value = 1
    elif inputData["Sex"].values[i] == "F":
        value = 0
    else:
        value = -1
    sexEncoded.append(value)
inputData["Sex"] = sexEncoded
y = inputData["Rings"]
X = inputData.copy(deep=True)
X.drop(["Rings"], axis=1, inplace=True)
scaler = MaxAbsScaler()
scaler.fit(X)
X = scaler.transform(X)
X = pd.DataFrame(X, columns=names[:-1])
X_train = X.values[0:3134]
X_test = X.values[3134:]
y_train = y[0:3134]
y_test = y[3134:]
tpot = TPOTRegressor(max_time_mins=60,
verbosity=1,
n_jobs=-1)
tpot.fit(X_train,y_train)
tpot.export('abalone.py')
y_predictions = tpot.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_predictions)
mae = sklearn.metrics.mean_absolute_error(y_test, y_predictions)
mse = sklearn.metrics.mean_squared_error(y_test.values, y_predictions)
rmse = np.sqrt(mse)
print("R2 score:", r2)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
60.51075633333333 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.
TPOT closed prematurely. Will use the current best pipeline.
R2 score: 0.5686918937966756
MAE: 1.493278634518192
MSE: 4.055867598889939
RMSE: 2.0139184687791953
That is equally awful…
Final pipeline:
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, SelectPercentile, f_regression
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
train_test_split(features, tpot_data['target'].values, random_state=None)
# Average CV score on the training set was:-4.814505839419024
exported_pipeline = make_pipeline(
SelectPercentile(score_func=f_regression, percentile=89),
StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.35000000000000003, tol=0.1)),
SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.6500000000000001, n_estimators=100), threshold=0.05),
FastICA(tol=0.15000000000000002),
RBFSampler(gamma=0.45),
RandomForestRegressor(bootstrap=False, max_features=0.2, min_samples_leaf=20, min_samples_split=17, n_estimators=100)
)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)