Let’s have a look at the Arcene dataset as part of my series on exploring lesser-known datasets. Arcene stems from the NIPS 2003 feature selection challenge: the task is to distinguish cancer from normal patterns in mass-spectrometric data, with 100 training and 100 validation samples and 10,000 features each (a good chunk of which are random probes added as distractors). A disclaimer upfront: I looked at this dataset for different purposes, so I haven’t read any of the published papers on it and haven’t checked any baselines.

Let’s dive in:

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

X_train = pd.read_csv("./data/arcene_train.data",
                      sep=" ",
                      header=None)
y_train = pd.read_csv("./data/arcene_train.labels",
                      sep=" ",
                      header=None)
X_test = pd.read_csv("./data/arcene_valid.data",
                      sep=" ",
                      header=None)
y_test = pd.read_csv("./data/arcene_valid.labels",
                      sep=" ",
                      header=None)
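
A quick shape check first (presumably each data line ends with a trailing space, so sep=" " produces an extra, all-empty 10,001st column):

print(X_train.shape, X_test.shape)  # expected: (100, 10001) each
print(y_train.shape, y_test.shape)  # expected: (100, 1) each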

X_train.describe()
[describe() output truncated: all 10,000 real features have 100 non-null, non-negative values on widely varying scales; column 10000 has count 0, i.e. it is entirely NaN]
X_test.describe()
[same picture on the validation set: 100 values per feature, and column 10000 is entirely NaN]

It looks like only the last column contains missing values (NaN). We can check that:

# the all-NaN column 10000 is a parsing artifact of the trailing separator, so drop it
print(X_train.isna().values.any())
print(X_test.isna().values.any())
X_train.drop([10000], inplace=True, axis=1)
X_test.drop([10000], inplace=True, axis=1)
print(X_train.isna().values.any())
print(X_test.isna().values.any())
True
True
False
False

It is time to see how the target classes are distributed:

y_hist_train_dict = dict(y_train[0].value_counts())
y_hist_test_dict = dict(y_test[0].value_counts())

plt.figure(figsize=(7,5))
plt.bar(list(y_hist_train_dict.keys()), y_hist_train_dict.values(), color="black")
plt.title("Arcene classes histogram - train set")
plt.ylabel("Count")
plt.xlabel("Label")
plt.tight_layout()
plt.show()

plt.figure(figsize=(7,5))
plt.bar(list(y_hist_test_dict.keys()), y_hist_test_dict.values(), color="black")
plt.title("Arcene classes histogram - test (valid) set")
plt.ylabel("Count")
plt.xlabel("Label")
plt.tight_layout()
plt.show()

That looks good! Both sets are reasonably balanced between the two classes, so plain accuracy is a usable first metric.

Let’s see if we can identify clear differences between the two classes visually.
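
One way to try (a minimal sketch for illustration; this isn’t the exact plot from my original run) is to overlay the mean feature profile of each class:

# compare the per-class mean of every feature
labels = y_train[0].values
pos_mean = X_train[labels == 1].mean(axis=0)
neg_mean = X_train[labels == -1].mean(axis=0)

plt.figure(figsize=(10, 5))
plt.plot(pos_mean.values, color="black", alpha=0.7, label="class +1")
plt.plot(neg_mean.values, color="grey", alpha=0.7, label="class -1")
plt.title("Arcene - mean feature profile per class")
plt.xlabel("Feature index")
plt.ylabel("Mean feature value")
plt.legend()
plt.tight_layout()
plt.show()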

Aaaah, way too many input features. Let’s hope that some scaling will help:

from sklearn.preprocessing import MaxAbsScaler

# cast to float before scaling
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')

# MaxAbsScaler divides each feature by its maximum absolute value,
# mapping the non-negative Arcene features into [0, 1]
scaler = MaxAbsScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Okay, that doesn’t help. Either we have to do a lot of feature reduction, or we see what ML algorithms come up with.
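
For a rough feel for the feature-reduction route, here is a minimal PCA sketch (an illustration under the assumption that linear structure is informative; this wasn’t part of my original run):

from sklearn.decomposition import PCA

# with only 100 samples there are at most 100 principal components;
# check how much variance the first 50 capture on the scaled training data
pca = PCA(n_components=50, random_state=42)
pca.fit(X_train)
print(pca.explained_variance_ratio_.cumsum()[-1])

Now for the ML algorithms: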

from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from skgarden import MondrianForestClassifier
import xgboost

def train_test_Gaussian_NB_classification(X_train, X_test, y_train, y_test,
                                          scorer, dataset_id):
    Gaussian_NB_classification = GaussianNB()
    # nothing to tune here; GridSearchCV is used only to keep the
    # interface consistent with the other models
    grid_obj = GridSearchCV(Gaussian_NB_classification,
                            param_grid={},
                            cv=4,
                            n_jobs=-1,
                            scoring=scorer)
    start_time = time.time()
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    prediction = grid_fit.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)

    return {'Classification type' : 'Gaussian Naive Bayes Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}


def train_test_decision_tree_classification(X_train, X_test,
                                            y_train, y_test,
                                            scorer, dataset_id):
    decision_tree_classification = DecisionTreeClassifier(random_state=42)
    grid_parameters_decision_tree_classification = {'max_depth' : [None, 3,5,7,9,10,11,100]}
    start_time = time.time()
    grid_obj = GridSearchCV(decision_tree_classification,
                            param_grid=grid_parameters_decision_tree_classification,
                            cv=4, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_decision_tree_classification = grid_fit.best_estimator_
    prediction = best_decision_tree_classification.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)

    return {'Classification type' : 'Decision Tree Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}




def train_test_random_forest_classification(X_train, X_test,
                                            y_train, y_test,
                                            scorer, dataset_id):
    random_forest_classification = RandomForestClassifier(random_state=42)
    grid_parameters_random_forest_classification = {'n_estimators' : [3, 5, 10, 15, 18, 100],
                                                    'max_depth' : [None, 2, 3, 5, 7, 9]}
    start_time = time.time()
    grid_obj = GridSearchCV(random_forest_classification,
                            param_grid=grid_parameters_random_forest_classification,
                            cv=4, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_random_forest_classifier = grid_fit.best_estimator_
    prediction = best_random_forest_classifier.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    
    return {'Classification type' : 'Random Forest Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}

def train_test_mondrian_forest_classification(X_train, X_test,
                                              y_train, y_test,
                                              scorer, dataset_id):
    mondrian_forest_classification = MondrianForestClassifier(random_state=42)
    grid_parameters_mondrian_forest_classification = {'n_estimators' : [3, 5, 10, 15, 18, 30, 100],
                                                      'max_depth' : [None, 2, 3, 5, 7, 9]}
    start_time = time.time()
    grid_obj = GridSearchCV(mondrian_forest_classification,
                            param_grid=grid_parameters_mondrian_forest_classification,
                            cv=4, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_mondrian_forest_classifier = grid_fit.best_estimator_
    prediction = best_mondrian_forest_classifier.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    
    return {'Classification type' : 'Mondrian Forest Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}


def xgboost_classification(X_train, X_test,
                           y_train, y_test,
                           scorer, dataset_id):
    x_gradient_boosting_classification = xgboost.XGBClassifier(silent=True, random_state=42)
    # 7 * 6 * 3 * 2 = 252 parameter combinations, times 4 CV folds = 1,008 fits
    grid_parameters_x_gradient_boosting_classification = {'n_estimators' : [3, 5, 18, 20, 60, 80, 150],
                                                          'max_depth' : [1, 2, 7, 9, 15, 20],
                                                          'learning_rate' : [0.001, 0.01, 0.1],
                                                          'booster' : ['gbtree', 'dart']}
    start_time = time.time()
    grid_obj = GridSearchCV(x_gradient_boosting_classification,
                            param_grid=grid_parameters_x_gradient_boosting_classification,
                            cv=4, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_gradient_boosting_classification = grid_fit.best_estimator_
    prediction = best_gradient_boosting_classification.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    
    return {'Classification type' : 'XGBoost Classification', 
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report':classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}


results = {}
counter = 0
scorer = 'neg_log_loss'

# flatten the single-column label frames to 1-D arrays to avoid sklearn shape warnings
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

for dataset in [0]:
    results[counter] = train_test_Gaussian_NB_classification(X_train, X_test, y_train, y_test,scorer,dataset)
    counter += 1
    print('Naive Bayes completed')
    results[counter] = train_test_decision_tree_classification(X_train, X_test, y_train, y_test,scorer,dataset)
    counter += 1
    print('Decision Trees completed')
    results[counter] = train_test_random_forest_classification(X_train, X_test, y_train, y_test,scorer,dataset)
    counter += 1
    print('random forest completed')
    results[counter] = train_test_mondrian_forest_classification(X_train, X_test, y_train, y_test,scorer,dataset)
    counter += 1
    print('mondrian forest completed')
    results[counter] = xgboost_classification(X_train, X_test, y_train, y_test,scorer,dataset)
    counter += 1
    print('XGBoost completed')
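
To compare the five runs side by side, here is a small helper (not part of the original write-up) that tabulates the collected results:

# pull the key figures out of the results dict into one table
summary = pd.DataFrame([{'Model': r['Classification type'],
                         'Accuracy': r['Accuracy'],
                         'Training time (s)': r['Training time']}
                        for r in results.values()])
print(summary.sort_values('Accuracy', ascending=False).to_string(index=False))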

And?

That is an awful result. Unfortunately, I lack the time to dig into this dataset properly right now. Perhaps I’ll revisit it some time in the future.