Let’s have a look at the Arcene dataset as part of my series on exploring lesser-known datasets. A disclaimer upfront: I originally looked at this dataset for a different purpose, so I haven’t read any of the published papers on it and haven’t checked any baselines.
Let’s dive in:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# the Arcene files are space-separated values without a header row
X_train = pd.read_csv("./data/arcene_train.data", sep=" ", header=None)
y_train = pd.read_csv("./data/arcene_train.labels", sep=" ", header=None)
X_test = pd.read_csv("./data/arcene_valid.data", sep=" ", header=None)
y_test = pd.read_csv("./data/arcene_valid.labels", sep=" ", header=None)
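Before looking at the summary statistics, a quick sanity check of the shapes (just a sketch, nothing Arcene-specific):
# sanity check: number of samples and columns per split
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)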
X_train.describe()
statistic | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 9991 | 9992 | 9993 | 9994 | 9995 | 9996 | 9997 | 9998 | 9999 | 10000 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 100.000000 | 100.000000 | 100.00000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | ... | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 0.0 |
mean | 37.060000 | 35.330000 | 56.50000 | 329.860000 | 17.110000 | 412.090000 | 232.330000 | 6.510000 | 48.170000 | 63.690000 | ... | 396.480000 | 45.920000 | 18.210000 | 106.070000 | 165.350000 | 197.650000 | 0.360000 | 88.530000 | 365.750000 | NaN |
std | 49.763647 | 37.690943 | 68.64011 | 164.264508 | 26.348057 | 108.375152 | 186.106142 | 13.149102 | 57.624009 | 91.626349 | ... | 233.176462 | 47.886161 | 23.914112 | 141.747054 | 145.380752 | 113.122118 | 2.110675 | 109.698619 | 106.055386 | NaN |
min | 0.000000 | 0.000000 | 0.00000 | 27.000000 | 0.000000 | 181.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 41.000000 | 0.000000 | 0.000000 | 0.000000 | NaN |
25% | 0.000000 | 0.000000 | 0.00000 | 213.750000 | 0.000000 | 314.250000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 309.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 94.500000 | 0.000000 | 0.000000 | 314.500000 | NaN |
50% | 15.000000 | 25.000000 | 19.50000 | 333.500000 | 0.000000 | 441.000000 | 266.000000 | 0.000000 | 30.000000 | 0.500000 | ... | 468.500000 | 34.500000 | 2.500000 | 48.500000 | 186.000000 | 167.000000 | 0.000000 | 38.500000 | 364.500000 | NaN |
75% | 57.500000 | 59.750000 | 107.25000 | 443.000000 | 29.000000 | 501.250000 | 422.750000 | 4.500000 | 79.250000 | 125.000000 | ... | 579.250000 | 75.250000 | 34.250000 | 140.000000 | 276.500000 | 295.750000 | 0.000000 | 165.500000 | 446.500000 | NaN |
max | 188.000000 | 218.000000 | 236.00000 | 677.000000 | 99.000000 | 583.000000 | 493.000000 | 55.000000 | 202.000000 | 391.000000 | ... | 654.000000 | 195.000000 | 83.000000 | 656.000000 | 485.000000 | 422.000000 | 18.000000 | 392.000000 | 533.000000 | NaN |
X_test.describe()
statistic | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 9991 | 9992 | 9993 | 9994 | 9995 | 9996 | 9997 | 9998 | 9999 | 10000 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 100.000000 | 100.000000 | 100.000000 | 100.0000 | 100.00000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | ... | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.00000 | 0.0 |
mean | 33.710000 | 38.780000 | 64.110000 | 320.9900 | 17.00000 | 409.790000 | 220.290000 | 8.880000 | 49.420000 | 77.840000 | ... | 370.230000 | 46.380000 | 18.740000 | 108.270000 | 190.390000 | 206.760000 | 1.020000 | 87.400000 | 357.55000 | NaN |
std | 47.918343 | 38.328611 | 73.758669 | 146.2053 | 27.95487 | 103.450891 | 177.575666 | 16.436151 | 58.984535 | 101.561219 | ... | 237.223198 | 50.228926 | 26.122717 | 173.086062 | 139.408556 | 123.901975 | 4.256854 | 113.776052 | 104.24557 | NaN |
min | 0.000000 | 0.000000 | 0.000000 | 0.0000 | 0.00000 | 219.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 31.00000 | NaN |
25% | 0.000000 | 1.000000 | 0.000000 | 209.0000 | 0.00000 | 309.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 153.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 91.500000 | 0.000000 | 0.000000 | 312.00000 | NaN |
50% | 10.000000 | 32.000000 | 29.500000 | 334.0000 | 0.00000 | 427.000000 | 217.500000 | 0.000000 | 25.000000 | 15.500000 | ... | 427.000000 | 37.000000 | 0.000000 | 41.500000 | 228.500000 | 195.000000 | 0.000000 | 38.000000 | 358.00000 | NaN |
75% | 49.250000 | 59.250000 | 128.750000 | 428.0000 | 31.25000 | 487.500000 | 396.750000 | 11.250000 | 80.750000 | 145.000000 | ... | 570.000000 | 81.000000 | 38.500000 | 113.750000 | 284.250000 | 323.250000 | 0.000000 | 151.250000 | 441.50000 | NaN |
max | 194.000000 | 184.000000 | 240.000000 | 705.0000 | 109.00000 | 635.000000 | 501.000000 | 74.000000 | 201.000000 | 425.000000 | ... | 692.000000 | 200.000000 | 90.000000 | 646.000000 | 509.000000 | 423.000000 | 24.000000 | 491.000000 | 527.00000 | NaN |
It looks like only the last column (index 10000) contains missing values (NaN). We can verify that and drop it:
# do the train and test sets contain any NaNs?
print(X_train.isna().values.any())
print(X_test.isna().values.any())

# drop the all-NaN trailing column (index 10000)
X_train.drop([10000], inplace=True, axis=1)
X_test.drop([10000], inplace=True, axis=1)

# check again after dropping
print(X_train.isna().values.any())
print(X_test.isna().values.any())
True
True
False
False
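As an aside, the hard-coded column index isn’t strictly necessary; a more generic sketch (which here removes only that same trailing column 10000) would be:
# alternative sketch: drop every column that is entirely NaN, without hard-coding the index
X_train = X_train.dropna(axis=1, how="all")
X_test = X_test.dropna(axis=1, how="all")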
It is time to see how the target classes are distributed:
y_hist_train_dict = dict(y_train[0].value_counts())
y_hist_test_dict = dict(y_test[0].value_counts())
plt.figure(figsize=(7,5))
plt.bar(list(y_hist_train_dict.keys()), y_hist_train_dict.values(), color="black")
plt.title("Arcene classes histogram - train set")
plt.ylabel("Count")
plt.xlabel("Label")
plt.tight_layout()
plt.show()
plt.figure(figsize=(7,5))
plt.bar(list(y_hist_test_dict.keys()), y_hist_test_dict.values(), color="black")
plt.title("Arcene classes histogram - test (valid) set")
plt.ylabel("Count")
plt.xlabel("Label")
plt.tight_layout()
plt.show()
That looks good!
Let’s see if we can identify clear differences between the two classes visually.
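As a minimal sketch of one way to do this (assuming the usual ±1 Arcene labels and the DataFrames loaded above), we can overlay the per-class mean of every feature:
# sketch: overlay the mean value of every feature for the two classes
pos_mean = X_train[y_train[0] == 1].mean()
neg_mean = X_train[y_train[0] == -1].mean()
plt.figure(figsize=(10, 5))
plt.plot(pos_mean.values, color="black", alpha=0.6, label="class +1")
plt.plot(neg_mean.values, color="grey", alpha=0.6, label="class -1")
plt.title("Arcene - per-class feature means")
plt.xlabel("Feature index")
plt.ylabel("Mean value")
plt.legend()
plt.tight_layout()
plt.show()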
Aaaah, way too many input features. Let’s hope that some scaling will help:
from sklearn.preprocessing import MaxAbsScaler

X_train = X_train.astype('float64')
X_test = X_test.astype('float64')

# scale each feature to [-1, 1] by its maximum absolute value, fitted on the training set only
scaler = MaxAbsScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
Okay, that doesn’t help. Either we have to do a lot of feature reduction, or we see what the ML algorithms come up with:
import sklearn
import sklearn.metrics
import xgboost
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skgarden import MondrianForestClassifier
def train_test_Gaussian_NB_classification(X_train, X_test, y_train, y_test, scorer, dataset_id):
    # GaussianNB has no hyperparameters to tune, hence the empty parameter grid
    Gaussian_NB_classification = GaussianNB()
    grid_obj = GridSearchCV(Gaussian_NB_classification,
                            param_grid={},
                            cv=4,
                            n_jobs=-1,
                            scoring=scorer)
    start_time = time.time()
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    prediction = grid_fit.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    return {'Classification type' : 'Gaussian Naive Bayes Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report' : classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}
def train_test_decision_tree_classification(X_train, X_test,
                                            y_train, y_test,
                                            scorer, dataset_id):
    decision_tree_classification = DecisionTreeClassifier(random_state=42)
    grid_parameters_decision_tree_classification = {'max_depth' : [None, 3, 5, 7, 9, 10, 11, 100]}
    start_time = time.time()
    grid_obj = GridSearchCV(decision_tree_classification,
                            param_grid=grid_parameters_decision_tree_classification,
                            cv=4, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_decision_tree_classification = grid_fit.best_estimator_
    prediction = best_decision_tree_classification.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    return {'Classification type' : 'Decision Tree Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report' : classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}
def train_test_random_forest_classification(X_train, X_test,
                                            y_train, y_test,
                                            scorer, dataset_id):
    random_forest_classification = RandomForestClassifier(random_state=42)
    grid_parameters_random_forest_classification = {'n_estimators' : [3, 5, 10, 15, 18, 100],
                                                    'max_depth' : [None, 2, 3, 5, 7, 9]}
    start_time = time.time()
    grid_obj = GridSearchCV(random_forest_classification,
                            param_grid=grid_parameters_random_forest_classification,
                            cv=4, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_random_forest_classifier = grid_fit.best_estimator_
    prediction = best_random_forest_classifier.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    return {'Classification type' : 'Random Forest Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report' : classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}
def train_test_mondrian_forest_classification(X_train, X_test,
                                              y_train, y_test,
                                              scorer, dataset_id):
    mondrian_forest_classification = MondrianForestClassifier(random_state=42)
    grid_parameters_mondrian_forest_classification = {'n_estimators' : [3, 5, 10, 15, 18, 30, 100],
                                                      'max_depth' : [None, 2, 3, 5, 7, 9]}
    start_time = time.time()
    grid_obj = GridSearchCV(mondrian_forest_classification,
                            param_grid=grid_parameters_mondrian_forest_classification,
                            cv=4, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_mondrian_forest_classifier = grid_fit.best_estimator_
    prediction = best_mondrian_forest_classifier.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    return {'Classification type' : 'Mondrian Forest Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report' : classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}
def xgboost_classification(X_train, X_test,
                           y_train, y_test,
                           scorer, dataset_id):
    x_gradient_boosting_classification = xgboost.XGBClassifier(silent=True, random_state=42)
    grid_parameters_x_gradient_boosting_classification = {'n_estimators' : [3, 5, 18, 20, 60, 80, 150],
                                                          'max_depth' : [1, 2, 7, 9, 15, 20],
                                                          'learning_rate' : [0.001, 0.01, 0.1],
                                                          'booster' : ['gbtree', 'dart']}
    start_time = time.time()
    grid_obj = GridSearchCV(x_gradient_boosting_classification,
                            param_grid=grid_parameters_x_gradient_boosting_classification,
                            cv=4, n_jobs=-1,
                            scoring=scorer, verbose=1)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_gradient_boosting_classification = grid_fit.best_estimator_
    prediction = best_gradient_boosting_classification.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
    classification_rep = classification_report(y_true=y_test, y_pred=prediction)
    return {'Classification type' : 'XGBoost Classification',
            'model' : grid_fit,
            'Predictions' : prediction,
            'Accuracy' : accuracy,
            'Classification Report' : classification_rep,
            'Training time' : training_time,
            'dataset' : dataset_id}
results = {}
counter = 0
# scoring metric used by all grid searches
scorer = 'neg_log_loss'

# flatten the single-column label DataFrames to 1-D arrays to avoid sklearn shape warnings
y_train = y_train[0].values
y_test = y_test[0].values

for dataset in [0]:
    results[counter] = train_test_Gaussian_NB_classification(X_train, X_test, y_train, y_test, scorer, dataset)
    counter += 1
    print('Naive Bayes completed')
    results[counter] = train_test_decision_tree_classification(X_train, X_test, y_train, y_test, scorer, dataset)
    counter += 1
    print('Decision Trees completed')
    results[counter] = train_test_random_forest_classification(X_train, X_test, y_train, y_test, scorer, dataset)
    counter += 1
    print('random forest completed')
    results[counter] = train_test_mondrian_forest_classification(X_train, X_test, y_train, y_test, scorer, dataset)
    counter += 1
    print('mondrian forest completed')
    results[counter] = xgboost_classification(X_train, X_test, y_train, y_test, scorer, dataset)
    counter += 1
    print('XGBoost completed')
And?
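For a quick comparison, a compact summary of the collected results can be printed (just a sketch that reads back the fields stored in the results dictionary above):
# print accuracy and training time for every trained model
for res in results.values():
    print("{}: accuracy={:.3f}, training time={:.1f}s".format(
        res['Classification type'], res['Accuracy'], res['Training time']))
    print(res['Classification Report'])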
That is an awful result. Unfortunately, I lack the time to really dig into this dataset right now. Perhaps I’ll revisit it some time in the future.