It’s time to look at another dataset, and a more traditional land use classification task this time: forest type classification. The dataset is hosted on the UCI Machine Learning Repository and originates from research by Johnson et al. (2012).

Let’s load the dataset and look at it:

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# the UCI repository provides pre-defined training and test splits
input_data_train = pd.read_csv("./data/training.csv")
input_data_test = pd.read_csv("./data/testing.csv")

display(input_data_train.head(2))
display(input_data_test.head(2))
display(input_data_train.describe())
First two rows of the training set:

class b1 b2 b3 b4 b5 b6 b7 b8 b9 ... pred_minus_obs_H_b9 pred_minus_obs_S_b1 pred_minus_obs_S_b2 pred_minus_obs_S_b3 pred_minus_obs_S_b4 pred_minus_obs_S_b5 pred_minus_obs_S_b6 pred_minus_obs_S_b7 pred_minus_obs_S_b8 pred_minus_obs_S_b9
0 d 39 36 57 91 59 101 93 27 60 ... -2.36 -18.41 -1.88 -6.43 -21.03 -1.60 -6.18 -22.50 -5.20 -7.86
1 h 84 30 57 112 51 98 92 26 62 ... -2.26 -16.27 -1.95 -6.25 -18.79 -1.99 -6.18 -23.41 -8.87 -10.83

First two rows of the test set:

class b1 b2 b3 b4 b5 b6 b7 b8 b9 ... pred_minus_obs_H_b9 pred_minus_obs_S_b1 pred_minus_obs_S_b2 pred_minus_obs_S_b3 pred_minus_obs_S_b4 pred_minus_obs_S_b5 pred_minus_obs_S_b6 pred_minus_obs_S_b7 pred_minus_obs_S_b8 pred_minus_obs_S_b9
0 d 67 51 68 115 69 111 136 31 67 ... -9.17 -18.27 -1.80 -6.32 -20.88 -1.63 -6.13 -22.56 -5.53 -8.11
1 s 67 28 51 99 50 97 82 26 59 ... -2.25 -20.13 -2.11 -6.35 -21.94 -1.22 -6.13 -22.20 -3.41 -6.57

Summary statistics of the training set:

b1 b2 b3 b4 b5 b6 b7 b8 b9 pred_minus_obs_H_b1 ... pred_minus_obs_H_b9 pred_minus_obs_S_b1 pred_minus_obs_S_b2 pred_minus_obs_S_b3 pred_minus_obs_S_b4 pred_minus_obs_S_b5 pred_minus_obs_S_b6 pred_minus_obs_S_b7 pred_minus_obs_S_b8 pred_minus_obs_S_b9
count 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000 ... 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000 198.000000
mean 62.949495 41.020202 63.676768 101.409091 58.732323 100.651515 90.601010 28.691919 61.116162 50.818889 ... -5.594141 -20.037576 -1.007121 -4.355657 -20.996919 -0.973737 -4.597626 -18.840000 -1.570808 -4.155859
std 12.779563 17.832543 17.314545 14.804627 12.392648 11.190314 15.588861 8.977752 9.787158 12.842321 ... 9.769193 4.948562 1.783671 2.352311 6.490763 0.702619 1.736712 5.251095 1.807792 1.982423
min 34.000000 25.000000 47.000000 54.000000 44.000000 84.000000 54.000000 21.000000 50.000000 7.660000 ... -53.530000 -32.950000 -8.800000 -11.210000 -40.370000 -3.270000 -8.730000 -34.140000 -8.870000 -10.830000
25% 54.000000 28.000000 52.000000 92.250000 49.000000 92.000000 80.000000 24.000000 55.000000 40.667500 ... -6.627500 -23.325000 -1.860000 -5.790000 -24.090000 -1.290000 -5.747500 -22.237500 -2.370000 -5.122500
50% 60.000000 31.500000 57.000000 99.500000 55.000000 98.000000 91.000000 25.000000 58.000000 53.030000 ... -2.255000 -20.020000 -0.970000 -4.350000 -20.465000 -0.945000 -4.540000 -19.200000 -1.420000 -4.125000
75% 70.750000 50.750000 69.000000 111.750000 65.000000 107.000000 101.000000 27.000000 63.000000 59.920000 ... 0.247500 -17.787500 -0.042500 -2.882500 -17.955000 -0.642500 -3.617500 -16.227500 -0.655000 -3.105000
max 105.000000 160.000000 196.000000 172.000000 98.000000 136.000000 139.000000 82.000000 109.000000 83.320000 ... 5.740000 5.130000 12.460000 7.370000 1.880000 3.440000 3.940000 3.670000 8.840000 7.790000

But what do these values mean? The dataset description tells us:

Attribute Information:

Class: ‘s’ (‘Sugi’ forest), ‘h’ (‘Hinoki’ forest), ‘d’ (‘Mixed deciduous’ forest), ‘o’ (‘Other’ non-forest land)

b1 - b9: ASTER image bands containing spectral information in the green, red, and near-infrared wavelengths for three dates (Sept. 26, 2010; March 19, 2011; May 08, 2011).

pred_minus_obs_S_b1 - pred_minus_obs_S_b9: Predicted spectral values (based on spatial interpolation) minus actual spectral values for the ‘s’ class (b1-b9).

pred_minus_obs_H_b1 - pred_minus_obs_H_b9: Predicted spectral values (based on spatial interpolation) minus actual spectral values for the ‘h’ class (b1-b9).

— dataset description on the UCI ML repo
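
For the later steps it helps to keep the letter-to-name mapping around. A minimal snippet (the class_names dict is my own naming, assuming the class column holds the bare letters):

# map the single-letter class codes from the description to readable names
class_names = {"s": "Sugi", "h": "Hinoki", "d": "Mixed deciduous", "o": "Other"}
print(input_data_train["class"].map(class_names).value_counts())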

Let’s see if we can derive anything from box plots:

input_data_train.plot.box(figsize=(12,7), xticks=[])
plt.title('Boxplots of all features')
plt.xlabel('Feature')
plt.ylabel('')
plt.show()

Next, we’ll have a look at class distributions in both sets:

# count the occurrences of each class in the training set
y_hist_train_dict = input_data_train["class"].value_counts().sort_index().to_dict()

plt.figure(figsize=(7,5))
plt.bar(list(y_hist_train_dict.keys()), y_hist_train_dict.values(), color="black")
plt.title("Forest type histogram - train set")
plt.ylabel("Count")
plt.xlabel("Forest type")
plt.tight_layout()
plt.show()

That looks fairly evenly distributed. Let’s look at the test set:

# same class counts for the test set
y_hist_test_dict = input_data_test["class"].value_counts().sort_index().to_dict()

plt.figure(figsize=(7,5))
plt.bar(list(y_hist_test_dict.keys()), y_hist_test_dict.values(), color="black")
plt.title("Forest type histogram - test set")
plt.ylabel("Count")
plt.xlabel("Forest type")
plt.tight_layout()
plt.show()

Well, that doesn’t look so great: the classes in the test set are noticeably less balanced. It’s time to scale the data and see if we can identify some distinct features visually.

# categorical encoding: map the class letters to integer codes
input_data_train['class'] = pd.Categorical(input_data_train['class']).codes
input_data_test['class'] = pd.Categorical(input_data_test['class']).codes

# split data into X and y
y_train = input_data_train['class'].copy(deep=True)
X_train = input_data_train.copy(deep=True)
X_train.drop(['class'], inplace=True, axis=1)

y_test = input_data_test['class'].copy(deep=True)
X_test = input_data_test.copy(deep=True)
X_test.drop(['class'], inplace=True, axis=1)

from sklearn.preprocessing import MaxAbsScaler

X_train = X_train.astype('float64')
X_test = X_test.astype('float64')

# fit the scaler on the training data only, then apply it to both sets
scaler = MaxAbsScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
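
MaxAbsScaler divides each feature by its maximum absolute value on the training set, so the training features end up in [-1, 1] and the signs of the pred_minus_obs columns are preserved. A quick sanity check (my addition, not part of the original analysis):

# after scaling, every training feature should lie within [-1, 1]
assert np.abs(X_train).max() <= 1.0 + 1e-12
print("max |value| in X_train:", np.abs(X_train).max())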

plt.figure(figsize=(13,8))
plt.boxplot(X_train, meanline=False, notch=True)
plt.title('Boxplots of all features - scaled')
plt.xlabel('Feature')
plt.ylabel('')
plt.show()

plt.figure(figsize=(11,9))
for cl in input_data_train["class"].unique():
    # drop the class column so only the spectral features are plotted
    example = input_data_train[input_data_train["class"] == cl].drop(columns="class").values[0]
    plt.plot(example, label=cl)
plt.title("Examples for each forest type - training set (unscaled)")
plt.legend()
plt.show()

plt.figure(figsize=(11,9))
for cl in input_data_train["class"].unique():
    # X_train is a NumPy array after scaling, so select rows with a boolean mask
    plt.plot(X_train[(input_data_train["class"] == cl).values][0], label=cl)
plt.title("Examples for each forest type - training set (scaled)")
plt.legend()
plt.show()

Let’s throw a few machine learning algorithms at it. (The neural networks are similar to the ones from the concrete compressive strength dataset.)
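
The full comparison isn’t reproduced here, but a minimal sketch of such a loop could look like the following (the model choices and hyperparameters are my assumptions, not the original setup; the Mondrian forest implementation would come from the scikit-garden package):

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# a few untuned baselines; if scikit-garden is installed, a Mondrian forest
# can be added via: from skgarden import MondrianForestClassifier
models = {
    "Random forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "k-NN": KNeighborsClassifier(),
    "SVC (RBF)": SVC(),
}

for name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f"{name}: test accuracy {acc:.3f} ({time.time() - start:.2f} s)")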

It looks like Mondrian forests perform best.