Let’s have a look at the ionosphere dataset as part of my series exploring lesser-known datasets. The dataset originates from Sigillito et al. (1989), “Classification of radar returns from the ionosphere using neural networks”, and is hosted on the UCI Machine Learning Repository.
Contents
Dataset exploration
The dataset consists of radar scans of the ionosphere from a radar array based in Goose Bay. It contains two classes: “good” means that some structure was detected in the ionosphere, while “bad” means that nothing was detected.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from skgarden import MondrianForestClassifier
import xgboost
# The file is read without a header row, so column names are supplied
# explicitly: 34 feature columns named "0".."33" followed by the label.
names = [*map(str, range(34)), "target"]
inputData = pd.read_csv("./data/ionosphere.data", names=names)

# Show a random handful of rows as a quick sanity check.
display(inputData.sample(5))
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
152 | 1 | 0 | 0.05866 | -0.00838 | 0.06704 | 0.00838 | 0.00000 | -0.01117 | 0.00559 | -0.03911 | ... | 0.00559 | 0.10335 | -0.00838 | 0.03073 | -0.00279 | 0.04469 | 0.00000 | 0.04749 | -0.03352 | b |
270 | 1 | 0 | 1.00000 | 0.08013 | 0.96775 | -0.00482 | 0.96683 | -0.00722 | 0.87980 | -0.03923 | ... | 0.02003 | 0.93772 | -0.03034 | 1.00000 | -0.05843 | 0.92774 | -0.03464 | 0.92226 | -0.03673 | g |
172 | 0 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | ... | 0.00000 | 1.00000 | -1.00000 | -1.00000 | -1.00000 | 1.00000 | 1.00000 | 0.00000 | 0.00000 | b |
149 | 1 | 0 | 0.90374 | -0.01604 | 1.00000 | 0.08021 | 1.00000 | 0.01604 | 0.93048 | 0.00535 | ... | 0.05348 | 0.96974 | 0.04452 | 0.87701 | 0.01070 | 1.00000 | 0.09091 | 0.97861 | 0.06417 | g |
310 | 1 | 0 | 0.93658 | 0.35107 | 0.75254 | 0.65640 | 0.45571 | 0.88576 | 0.15323 | 0.95776 | ... | -0.84951 | -0.04578 | -0.91221 | 0.27330 | -0.85762 | 0.54827 | -0.69613 | 0.74828 | -0.44173 | g |
# Encode the class label as an integer: "g" maps to 1, every other value
# to 0. Iterating the column directly avoids the index-based loop, which
# re-read ``inputData["target"].values`` on every iteration.
targetEncoded = [1 if label == "g" else 0 for label in inputData["target"]]
inputData["target_encoded"] = targetEncoded

# The string label is redundant once the numeric column exists.
inputData.drop(["target"], axis=1, inplace=True)
Let’s have a look at a per-class example:
Classes are not evenly distributed:
However, if we follow the original dataset description, then we have an equally distributed training set.
# Label vector and feature frame (every column except the encoded label).
y = inputData["target_encoded"].values
X = inputData.drop(["target_encoded"], axis=1)

# Positional split: the first 200 rows are used for training and the
# remaining rows for testing.
X_train, X_test = X.values[:200], X.values[200:]
y_train, y_test = y[:200], y[200:]
Brute force approach
The baseline result is an accuracy of > 96%. Let’s see if we can beat it.
Well, it seems like the Mondrian forest is the best solution for this classification problem. However, it also took the longest to train ;):
NB: Training was done on quite a slow notebook.
TPOT
Let’s see if TPOT comes up with something better.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import sklearn.metrics
from tpot import TPOTClassifier
# The file is read without a header row, so column names are supplied
# explicitly: 34 feature columns named "0".."33" followed by the label.
names = [*map(str, range(34)), "target"]
inputData = pd.read_csv("./data/ionosphere.data", names=names)
# Encode the class label as an integer: "g" maps to 1, every other value
# to 0. Iterating the column directly avoids the index-based loop, which
# re-read ``inputData["target"].values`` on every iteration.
targetEncoded = [1 if label == "g" else 0 for label in inputData["target"]]
inputData["target_encoded"] = targetEncoded

# The string label is redundant once the numeric column exists.
inputData.drop(["target"], axis=1, inplace=True)
# Label vector and feature frame (every column except the encoded label).
y = inputData["target_encoded"].values
X = inputData.drop(["target_encoded"], axis=1)

# Positional split: the first 200 rows are used for training and the
# remaining rows for testing.
X_train, X_test = X.values[:200], X.values[200:]
y_train, y_test = y[:200], y[200:]
from tpot import TPOTClassifier

# Run the TPOT pipeline search with a 60-minute time budget.
tpot = TPOTClassifier(
    max_time_mins=60,
    verbosity=1,
    n_jobs=-1,
)
tpot.fit(X_train, y_train)

# Score the best pipeline found on the held-out rows.
print(tpot.score(X_test, y_test))

# Write the best pipeline out as a standalone Python script.
tpot.export('ionosphere_radar_signal_classification.py')
60.12425203333333 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.
TPOT closed prematurely. Will use the current best pipeline.
Best pipeline: ExtraTreesClassifier(PCA(Normalizer(input_matrix, norm=l1), iterated_power=4, svd_solver=randomized), bootstrap=False, criterion=entropy, max_features=0.5, min_samples_leaf=1, min_samples_split=11, n_estimators=100)
0.9668874172185431
That looks worse than brute-forcing it, especially if we consider a few seconds vs 60 mins of run-time.
The final TPOT pipeline after 60 mins is:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
# NOTE: Make sure that the class is labeled 'target' in the data file
# The path and separator below are placeholders left by the export and
# must be replaced before this script can run.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.9497373358348968
# Pipeline steps, in order: L1 row normalisation -> PCA -> extra-trees classifier.
exported_pipeline = make_pipeline(
    Normalizer(norm="l1"),
    PCA(iterated_power=4, svd_solver="randomized"),
    ExtraTreesClassifier(
        bootstrap=False,
        criterion="entropy",
        max_features=0.5,
        min_samples_leaf=1,
        min_samples_split=11,
        n_estimators=100,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)