Chapter 9: Introduction to Machine Learning


The Python code was developed from the R code written by
Samuel Shen, Distinguished Professor, San Diego State University, California, USA.

Kaelia Okamura, Thomas Bui, Liu Yang, and Danielle Lafarga contributed to the Python code development.
Kaelia Okamura composed and finalized the code.



In [1]:
import numpy as np
import pandas as pd
import math as m
import matplotlib
import statistics as stats
import scipy.stats as scistats
import matplotlib.pyplot as plt
import matplotlib.colors as color
import os
import sys
import math
import statsmodels.api as sm
#import shapely
import statistics
import random
import warnings
warnings.filterwarnings("ignore")

from sklearn.cluster import KMeans
from sklearn.tree import plot_tree
from sklearn import datasets
from sklearn import svm
from scipy.spatial import ConvexHull
from sympy import symbols, diff
from scipy.optimize import fsolve
from matplotlib import cm as cm1
from scipy import optimize as opt
from pandas import read_table
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Polygon
from scipy.ndimage import gaussian_filter
from sklearn.linear_model import LinearRegression
In [2]:
# Style dictionary to standardize the plotting scheme 
# across the different Python scripts 
styledict = {'xtick.labelsize':21,
             'xtick.major.size':9,
             'xtick.major.width':1,
             'ytick.labelsize':21,
             'ytick.major.size':9,
             'ytick.major.width':1,
             'legend.framealpha':0.0,
             'legend.fontsize':15,
             'axes.labelsize':22,
             'axes.titlesize':22,
             'axes.linewidth':2,
             'figure.figsize':(12,8),
             'savefig.format':'jpg'}
plt.rcParams.update(**styledict)
In [3]:
# set your working directory
os.chdir("/Users/sshen/climstats")

tWCSS calculation for N = 3 and K = 2

In [4]:
N = 3
# create a 3 x 2 matrix of data
mydata = np.array([1,1,2,1,3,3.5]).reshape(3,2)

# first row of data
x1 = mydata[0,:]
# second row of data
x2 = mydata[1,:]
# third row of data
x3 = mydata[2,:]
In [5]:
# case C1 = (P1, P2)
c1 = (x1 + x2)/2
c2 = mydata[2,:]

tWCSS = np.linalg.norm(x1 - c1)**2 +  np.linalg.norm(x2 - c1)**2 +\
        np.linalg.norm(x3 - c2)**2
print(tWCSS)

# case C1 = (P1, P3)
c1 = (x1 + x3)/2
c2 = mydata[1,:]

print(np.linalg.norm(x1 - c1)**2 +  np.linalg.norm(x3 - c1)**2 + \
      np.linalg.norm(x2 - c2)**2)

# case C1 = (P2, P3)
c1 = (x2 + x3)/2
c2 = mydata[0,:]

print(np.linalg.norm(x2 - c1)**2 +  np.linalg.norm(x3 - c1)**2 + \
      np.linalg.norm(x1 - c2)**2)
0.5
5.125
3.6249999999999996
In [6]:
# case C1 = (P1,P2) can be quickly found by 
kmeans = KMeans(n_clusters=2).fit(mydata)

# note: Python labels the clusters 0 and 1, while R numbers them 1 and 2
print(kmeans.labels_)
[0 0 1]
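The total WCSS of any labeling can be checked with a small helper function. A minimal sketch (the function total_wcss below is ours, not part of scikit-learn); its result should equal kmeans.inertia_:

# sum of squared distances of each point to its own cluster centroid
def total_wcss(data, labels):
    wcss = 0.0
    for k in np.unique(labels):
        members = data[labels == k]
        center = members.mean(axis=0)
        wcss += ((members - center)**2).sum()
    return wcss

print(total_wcss(mydata, kmeans.labels_)) # 0.5, the minimum found above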

Figure 9.1: K-means for N = 3 and K = 2

In [7]:
#Python code: K-means computing for N = 3 and K = 2
N = 3
# create a 3 x 2 matrix of data
mydata = np.array([1,1,2,1,3,3.5]).reshape(3,2)

# first row of data
x1 = mydata[0,:]
# second row of data
x2 = mydata[1,:]
# third row of data
x3 = mydata[2,:]

# case C1 = (P1, P2)
c1 = (x1 + x2)/2
c2 = mydata[2,:]
tWCSS = np.linalg.norm(x1 - c1)**2 +  np.linalg.norm(x2 - c1)**2 +\
        np.linalg.norm(x3 - c2)**2
print(tWCSS)
#0.5

# case C1 = (P1, P3)
c1 = (x1 + x3)/2
c2 = mydata[1,:]
print(np.linalg.norm(x1 - c1)**2 +  np.linalg.norm(x3 - c1)**2 + \
      np.linalg.norm(x2 - c2)**2)

# case C1 = (P2, P3)
c1 = (x2 + x3)/2
c2 = mydata[0,:]
print(np.linalg.norm(x2 - c1)**2 +  np.linalg.norm(x3 - c1)**2 + \
      np.linalg.norm(x1 - c2)**2)
#3.625
      
# case C1 = (P1, P2) can be quickly found by 
kmeans = KMeans(n_clusters=2).fit(mydata)

print(kmeans.labels_)
#[0 0 1] #points P1 and P2 are in cluster C1 
#Python cluster labels start from 0 while R starts from 1;
#the 0/1 labels may also be swapped between runs (as in the output below)

# number of data points
N = 3
# assume K clusters
K = 2
# create a 3 x 2 matrix of data
mydata =  np.array([1, 1, 2, 1, 3, 3.5]).reshape(3,2)

Kclusters = KMeans(n_clusters=K).fit(mydata)
# cluster means
print(Kclusters.cluster_centers_, "\n")
#[[1.5 1. ]
#[3.  3.5]] 
# sum of squares by cluster
print(Kclusters.inertia_)
#0.5
0.5
5.125
3.6249999999999996
[1 1 0]
[[3.  3.5]
 [1.5 1. ]] 

0.5
In [8]:
#Python  Fig. 9.1: K-means clustering for N=3 and K=2
#starting from setting up the plot
plt.figure(figsize = (8,8))
plt.ylim([0,4])
plt.yticks([0,1,2,3,4])
plt.xlim([0,4])
plt.xticks([0,1,2,3,4])
plt.title("K-means clustering for \n \
  three points and two clusters", pad = 10)
plt.xlabel("x", labelpad = 10)
plt.ylabel("y", labelpad = 10)

# plot P1-P3
plt.scatter(mydata[:,0], mydata[:,1], 
            color = ('r', 'r', 'dodgerblue'), 
            facecolors='none', s = 80)
# plot C1 and C2
plt.scatter((Kclusters.cluster_centers_[0][0], 
                Kclusters.cluster_centers_[1][0]),
               (Kclusters.cluster_centers_[0][1],
               Kclusters.cluster_centers_[1][1]),
              marker = 'X', color = ('dodgerblue', 'r'))

# add labels
plt.text(1.43, 0.8, "$C_1$", color = 'r', size = 14)
plt.text(3.1, 3.45, "$C_2$", color = 'dodgerblue', size =14)
plt.text(0.95,1.1, "$P_1$", color = 'r', size = 14)
plt.text(1.95,1.1, "$P_2$", color = 'r', size = 14)
plt.text(2.95,3.3, "$P_3$", color = 'dodgerblue', size = 14)

plt.show()

Figure 9.2: K-means clustering for 2001 daily weather data at Miami International Airport, Station ID USW00012839

In [9]:
# load data for Fig. 9.2(a)
dat = pd.read_csv('data/MiamiIntlAirport2001_2020.csv')
# return dimension of data
print(dat.shape)

# TMIN column subset 
tmin = dat.loc[:,'TMIN']
# WDF2 column subset
wdf2 = dat.loc[:,'WDF2']
# set up plot
plt.yticks([0,50,100,200,300])
plt.title("(a) 2001 Daily Miami Tmin vs WDF2", pad = 10)
plt.xlabel(r"Tmin [$\degree$C]", labelpad = 10)
plt.ylabel(r"Wind Direction [$\degree$]", labelpad = 10)

# plot the 2001 daily data; note the index shift from the 1-based R code
plt.scatter(tmin[1:366], wdf2[1:366], color = 'k')

plt.show()
(7305, 29)
In [10]:
# K-means clustering for Fig. 9.2(b)
K = 2
# combine data into two column df
mydata = pd.concat([tmin[1:366],wdf2[1:366]], axis = 1)

# K-means clustering
fit = KMeans(n_clusters=K).fit(mydata)

# cluster center coordinates
print(fit.cluster_centers_, '\n') # centers of the two clusters
print(fit.inertia_) # total WCSS

# K-means with a fixed random state for reproducible labels
kmeans = KMeans(n_clusters=K, random_state=0)
mydata['cluster'] = \
kmeans.fit_predict(mydata[['TMIN', 'WDF2']])

colors = ['black', 'red']
mydata['color'] = \
mydata.cluster.map({0:colors[0], 1:colors[1]})

x1 = fit.cluster_centers_[0][0] #Cluster center
x2 = fit.cluster_centers_[1][0]
y1 = fit.cluster_centers_[0][1]
y2 = fit.cluster_centers_[1][1]

plt.title("(b) K-means for Tmin vs WDF2", pad = 10)
plt.xlabel(r"Tmin [$\degree$C]", labelpad = 10)
plt.ylabel(r"Wind Direction [$\degree$]", labelpad = 10)
plt.yticks([0,50,100,200,300])
# plot points
plt.scatter(mydata['TMIN'], mydata['WDF2'], 
             s = 10, color = mydata['color']) #s is size

for i in mydata.cluster.unique():
    points = \
    mydata[mydata.cluster == i][['TMIN', 'WDF2']].values
    # get convex hull
    hull = ConvexHull(points)
    # get x and y coordinates
    # repeat last point to close the polygon
    x_hull = np.append(points[hull.vertices,0],
                       points[hull.vertices,0][0])
    y_hull = np.append(points[hull.vertices,1],
                       points[hull.vertices,1][0])
    # plot shape
    plt.fill(x_hull, y_hull, alpha=0.2, color=colors[i])
    # plot centers   
plt.scatter([x1,x2],[y1,y2], color = ['red', 'black'], 
            linewidth = 2, facecolors= 'none', s = 700)
plt.show()
[[ 21.93356643 103.91608392]
 [ 18.38607595 278.86075949]] 

457844.8668141984

Figure 9.3: tWCSS(K) and pWCSS(K)

In [11]:
#Plot Fig. 9.3(a): Elbow principle
twcss = np.zeros(9)
for K in range(1,9):
    mydata = pd.concat([tmin[1:366],wdf2[1:366]], axis = 1)
    twcss[K] = KMeans(n_clusters=K).fit(mydata).inertia_
# remove K = 0 value from twcss
twcss = twcss[1:]
# plot elbow
plt.plot(np.linspace(1,8,8), twcss/100000, 'k', linewidth=2)
# plot points
plt.scatter([np.linspace(1,8,8)], [twcss/100000], color='k')

# plot elbow at K = 2
plt.scatter(2,twcss[1]/100000, color = 'blue', s = 500)
# add text
plt.text(2.2, 5.2, 'Elbow at K = 2', color='blue', size=20)
# add labels
plt.title("(a) The elbow principle from tWCSS scores", 
          pad = 10)
plt.xlabel("K", labelpad = 10)
plt.ylabel(r"tWCSS [x $10^5$]", labelpad = 10)
plt.show()
In [12]:
#Plot Fig. 9.3(b): Knee principle
# compute percentage of variance explained
pWCSS = 100*(twcss[0]-twcss)/twcss[0]
# plot knee
plt.plot(np.linspace(1,8,8),pWCSS, 'k', linewidth = 2)
# add points
plt.scatter(np.linspace(1,8,8),pWCSS, color = 'k')

# plot knee at K = 2
plt.scatter(2,pWCSS[1], color = 'blue', s = 500)
# add text
plt.text(2.2, 76, 'Knee at K = 2', color = 'blue', 
         size = 20)
# add labels
plt.title("(b) The knee princple from pWCSS variance", 
          pad = 10)
plt.xlabel("K", labelpad = 10)
plt.ylabel("pWCSS [%]", labelpad = 10)
plt.show()
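The elbow/knee location can also be estimated numerically. A minimal sketch using the largest second difference of tWCSS, a common heuristic that is not part of the original R code:

# second differences of tWCSS; element i corresponds to K = i + 2
d2 = np.diff(twcss, 2)
print('largest curvature at K =', np.argmax(d2) + 2) # typically K = 2 here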

Figure 9.4: Convex hull for a cluster

In [14]:
K = 2
# assign each day to one of the K = 2 clusters
mydata['cluster'] = \
       kmeans.fit_predict(mydata[['TMIN', 'WDF2']])
mydata['day']= np.arange(1, 366)
subset = mydata[mydata['cluster']==0]
# set up plot
plt.title("Cluster 1 of Miami Tmin vs WDF2", pad = 10)
plt.xlabel(r"Tmin [$\degree$C]", labelpad = 10)
plt.ylabel(r"WDF2 [$\degree$]", labelpad = 10)

plt.xlim([0,30])
plt.ylim([0,400])
plt.yticks([0,100,200,300])

# plot points
plt.scatter(subset["TMIN"],subset["WDF2"], color = 'k')
# add labels
for i in range(len(subset["TMIN"])):
    plt.text(subset["TMIN"].values[i], 
             subset["WDF2"].values[i], 
             subset["day"].values[i], size=12)  
# add boundary
for i in subset.cluster.unique():
    points = \
      subset[subset.cluster == i][['TMIN', 'WDF2']].values
    # get convex hull
    hull = ConvexHull(points)
    # get x and y coordinates
    # repeat last point to close the polygon
    x_hull = np.append(points[hull.vertices,0],
                       points[hull.vertices,0][0])
    y_hull = np.append(points[hull.vertices,1],
                       points[hull.vertices,1][0])
    # plot shape
    plt.fill(x_hull, y_hull, alpha=0.2, color=colors[i])

Figure 9.5: Maximum difference between two points

In [15]:
# define the two points
x = np.array([1,1,3,3]).reshape(2,2)
# define the two labels
y = (-1,1)
# set up plot
plt.figure(figsize = (10,10))
plt.title("Maximum Difference Between Two Points", pad = 10)
plt.xlabel("$x_1$", labelpad = 10)
plt.ylabel("$x_2$", labelpad = 10)
plt.xlim(-2.2,6.2)
plt.xticks([-2,-1,0,1,2,3,4,5,6],[-2,'',0,'',2,'',4,'',6])
plt.ylim(-2.2,6.2)
plt.yticks([-2,-1,0,1,2,3,4,5,6],[-2,'',0,'',2,'',4,'',6])
plt.grid()
# plot points
plt.scatter(x[0,0], x[0,1], color = 'r')
plt.scatter(x[1,0], x[1,1], color = 'dodgerblue')
# connect points
plt.plot([1,3],[1,3], 'k')
# add dashed lines
plt.plot([0,6],[6,0], color ='dodgerblue', linestyle ='--')
plt.plot([-2,6],[6,-2], color = 'purple', linestyle = '--')
plt.plot([-2,4],[4,-2], 'r', linestyle = '--')
# plot arrows
plt.arrow(1,1,0.5,0.5, head_width = 0.12, color='k') 
plt.arrow(4,0,0.3,0.3, head_width = 0.12, color='limegreen')
# add text
plt.text(0.5,4.3, "Positive Hyperplane", rotation = -45, 
         size = 14, color = 'dodgerblue')
plt.text(3.2,3.2, "$P_2$", size = 15, color = 'dodgerblue')
plt.text(-1.2,3.8, "Separating Hyperplane", rotation = -45, 
         size = 14, color = 'purple')
plt.text(4.4,0.1, "w", size = 15, color = 'limegreen')
plt.text(1.5,1.2, "n", size = 15)
plt.text(-2,2.7, "Negative Hyperplane", rotation = -45, 
         size = 14, color = 'r')
plt.text(0.7, 0.6, "$P_1$", size = 15, color = 'r')

plt.show()

Figure 9.6: SVM for three points

In [16]:
# define x for ease of plotting
x = np.array([1,1,2,1,3,3.5])
# redefine x for svm
X = [[1, 1],[2,1],[3,3.5]]
y = [1, 1, -1]
# define SVM
svm3P = svm.SVC(kernel='linear')
# train SVM
svm3P.fit(X, y)
# predict new data using the trained SVM
print('The prediction is: ',
      svm3P.predict([[0.5, 2.5],[4.5,4]]), '\n')
# find the hyperplane w.x - b = 0: normal vector w and offset b
w = svm3P.coef_[0]
print('w is: ',w, '\n')
b = -svm3P.intercept_ # sklearn stores the hyperplane as w.x + intercept_ = 0
print('b is: ', b, '\n')
# find the maximum margin of separation
print(2/np.linalg.norm(w))
x1 = np.linspace(0,5,5)
x2 = (b - w[0]*x1)/w[1]
x2p = (1 + b - w[0]*x1)/w[1]
x2m = (-1 + b - w[0]*x1)/w[1]
x20 = (b - w[0]*x1)/w[1]
# set up plot
plt.figure(figsize = (10,10))
plt.title("SVM for three points labeled in two categories", 
          pad = 10)
plt.xlabel("$x_1$", labelpad = 10)
plt.ylabel("$x_2$", labelpad = 10)
plt.xlim(-0.2,5.2)
plt.ylim(-0.2,6.2)
plt.grid()
# plot original points
plt.scatter([x[::2]],[x[1::2]],
            color=['dodgerblue','dodgerblue','red'])
# add lines
plt.plot(x1,x2p, color = 'dodgerblue', linestyle = '--')
plt.plot(x1, x2m, color = 'red', linestyle = '--')
plt.plot(x1, x20, color = 'purple')
# plot diamonds
plt.scatter([0.5,4.5], [2.5,4], color = 'k', marker = 'D')
# add text
plt.text(0.65,2.45,"$Q_1$", size = 16, color = 'dodgerblue')
plt.text(4.65,3.95, "$Q_2$", size = 16, color = 'r')
plt.text(0.3,5.4, "Two blue points and a red point \n \
are training data for an SVM",
        size = 16, color = 'dodgerblue')
plt.text(1.7,4.5, "Two black diamond points \n \
         are to be predicted by the SVM",
        size = 16, color = 'k')
plt.show()
The prediction is:  [ 1 -1] 

w is:  [-0.27586207 -0.68965517] 

b is:  [-2.24137931] 

2.6925824035672523
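The two predictions can be verified directly from the hyperplane: the sign of w.x - b gives the class. A short check (Qnew is our name for the two query points):

Qnew = np.array([[0.5, 2.5], [4.5, 4]])
# sklearn's decision function is coef_.x + intercept_, i.e. w.x - b here
print(svm3P.decision_function(Qnew))
print(Qnew @ w - b)           # same values computed by hand
print(np.sign(Qnew @ w - b))  # +1, -1: the labels predicted above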

Figure 9.7: SVM for many points

In [17]:
# define x for plotting
x = np.array([1,6,2,8,3,7.5,1,8,
              4,9,5,9,3,7,5,9,1,5,
              5,3,6,4,7,4,8,6,9,5,
              10,6,5,0,6,5,8,2,2,2,1,1])
# define y
y = [1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2]
# redefine x for SVM
X = [[1,6],[2,8],[3,7.5],[1,8],
              [4,9],[5,9],[3,7],[5,9],[1,5],
              [5,3],[6,4],[7,4],[8,6],[9,5],
              [10,6],[5,0],[6,5],[8,2],[2,2],[1,1]]

svmP = svm.SVC(kernel='linear')
# train SVM
svmP.fit(X, y)

# predict new data using trained SVM
print('The support vectors are: ',
      svmP.support_vectors_, '\n')

# find the hyperplane w.x - b = 0: normal vector w and offset b
w = -svmP.coef_[0]
print('w is: ',w, '\n')

# sklearn stores the hyperplane as coef_.x + intercept_ = 0;
# negating coef_ gives w.x - b = 0 with b = intercept_
# (R's svm() reports rho, the negative of the intercept)
b = svmP.intercept_
print('b is: ',b, '\n')

# find the maximum margin of separation
print(2/np.linalg.norm(w))
x1 = np.linspace(0,10,31)
x2 = (b - w[0]*x1)/w[1]
x2p = (1 + b - w[0]*x1)/w[1]
x2m = (-1 + b - w[0]*x1)/w[1]
x20 = (b - w[0]*x1)/w[1]

thetasvm = math.atan(-w[0]/w[1])*180/np.pi
# degree angle of the hyperplane
print(thetasvm)

delx = 1.4
dely = delx * (-w[0]/w[1])

# new predicted data points
svmP.predict([[0.5,2.5],[7,2],[6,9]])
The support vectors are:  [[1. 5.]
 [6. 5.]
 [2. 2.]] 

w is:  [-0.39996  0.53328] 

b is:  [1.26657333] 

3.0003000300029994
36.86989764584401
Out[17]:
array([2, 2, 1])
In [18]:
#The plotting part of  Fig. 9.7
plt.figure(figsize = (10,10))#set up plot
plt.xlim(-0.2,10.2)
plt.ylim(-0.2,10.2)
plt.title("SVM application to many points of two labels", 
          pad = 10)
plt.xlabel("$x_1$", labelpad = 10)
plt.ylabel("$x_2$", labelpad = 10)

# plot points
color = ['r','r','r','r','r','r','r','r','r','forestgreen',
         'forestgreen','forestgreen','forestgreen',
  'forestgreen','forestgreen','forestgreen','forestgreen',
         'forestgreen','forestgreen','forestgreen',]
plt.scatter(x[::2],x[1::2], color = color)
# plot newly predicted points
plt.scatter([0.5,7,6],[2.5,2,9], marker = "^", 
            color = 'k', s = 90)
# plot lines
plt.plot(x1,x2p, color = 'red', linestyle = '--')
plt.plot(x1,x2m, color = 'forestgreen', linestyle = '--')
plt.plot(x1,x20,color = 'purple')

# add text
plt.text(5+2*delx,6.5+2*dely,'$w \cdot x - b = 0$', 
         color = 'purple', rotation = thetasvm, size = 15)
plt.text(5-delx, 7.6-dely, '$w \cdot x - b = 1$', 
         color = 'red', rotation = thetasvm, size = 15)
plt.text(5, 4.8,'$w \cdot x - b = -1$', 
    color = 'forestgreen', rotation = thetasvm, size = 15)
plt.text(1.8,4.3,'w', color = 'blue', size = 15)
plt.text(0.3,2.1,'$Q_1$', color = 'k', size = 15)
plt.text(6.8,1.6,'$Q_2$', color = 'k', size = 15)
plt.text(5.8,8.6,'$Q_3$', color = 'k', size = 15)

# add normal direction of the hyperplanes
plt.arrow(2,3.86,w[0],w[1],color = 'blue', head_width = 0.1)
plt.show()

Figure 9.8: R.A. Fisher data of three iris species

In [19]:
iris = datasets.load_iris()
# notice "target" is equivalent to "species"
iris = pd.DataFrame(data=np.c_[iris['data'],iris['target']],
                columns= iris['feature_names'] + ['target'])

# check the structure of the data
print(iris.info(), '\n')

# check the first two rows of the data
iris.iloc[0:2,]

# set up plot
plt.title("R. A. Fisher data of iris flowers", pad = 10)
plt.xlabel("Sorted order of the flowers for measurement", 
           labelpad = 10)
plt.ylabel("Length or width [cm]", labelpad = 10)
plt.xticks([0,50,100,150])
plt.ylim(-2,9)
plt.yticks([0,2,4,6,8])

# plot the data of 150 observations
x = np.linspace(0,149,150)
plt.plot(x,iris.iloc[:,0],'ko-', label = 'Sepal length')
plt.plot(x,iris.iloc[:,1],'ro-', label = 'Sepal width')
plt.plot(x,iris.iloc[:,2],'go-', label = 'Petal length')
plt.plot(x,iris.iloc[:,3],'bo-', label = 'Petal width')

# add legend
plt.legend()

# add text
plt.text(13,-1, "Setosa 1-50", size = 15)
plt.text(57,-1, "Versicolor 51-100", size = 15)
plt.text(107,-1, "Virginica 101-150", size = 15)

plt.show()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    float64
dtypes: float64(5)
memory usage: 6.0 KB
None 

Fig. 9.9: RF prediction with the Fisher iris data

In [20]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics
# randomly select 120 observations as training data
train_id = random.sample(range(0, 150), 120)
train_data = iris.iloc[train_id,:]
print(train_data.shape)# dimensions of training data
# use the remaining 30 as the new data for prediction
new_data = iris.drop(train_id)
print(new_data.shape)# dimensions of new data
# train the RF model
classifyRF = RandomForestClassifier(n_estimators=800, 
                                    oob_score=True)
training = classifyRF.fit(train_data.iloc[:,:4], 
               train_data.iloc[:,4])
predictions = classifyRF.predict(new_data.iloc[:,:4])
print(predictions)
#[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 2. 
#1. 2. 1. 1. 2. 1. 1. 2. 2. 2. 2.
# 2. 2. 2. 2. 2. 2.] 
#0 = setosa, 1 = versicolor, 2 = virginica
confusion_M = confusion_matrix(new_data.iloc[:,4], 
                               predictions)
print(confusion_M)
#[[ 9  0  0] #all correct predictions for setosa
# [ 0  8  3]  #3 wrong predictions for versicolor
# [ 0  0 10]] #all correct predictions for virginica
accuracy_score(new_data.iloc[:,4], 
                               predictions)
#0.9
(120, 5)
(30, 5)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2.
 2. 2. 2. 2. 2. 2.]
[[13  0  0]
 [ 0  8  0]
 [ 0  0  9]]
Out[20]:
1.0
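The accuracy score can also be recovered from the confusion matrix itself, since the correct predictions sit on its diagonal:

print(np.trace(confusion_M) / confusion_M.sum()) # same value as accuracy_score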
In [21]:
#Python plot Fig. 9.9(a): RF mean accuracy 
n_estimators = 100
classifyRF = RandomForestClassifier(n_estimators, 
                                    oob_score=True)
RFscore = []
for i in range(1, n_estimators + 1):
    classifyRF.set_params(n_estimators=i)
    classifyRF.fit(train_data.iloc[:,:4], 
               train_data.iloc[:,4])
    # mean accuracy on the training data (not the OOB score)
    RFscore.append(classifyRF.score(train_data.iloc[:,:4], 
               train_data.iloc[:,4]))  
plt.plot(RFscore)
plt.title("Random forest score of the mean accuracy")
plt.xlabel("Number of estimators")
plt.ylabel("Mean accuracy")
plt.show()
In [22]:
#Python plot Fig. 9.9(b): RF importance plot
# Python shows the importance results differently from R 
plt.plot(np.linspace(1,4,4),
         classifyRF.feature_importances_, 
         'ko', markersize = 15)
plt.title("Importance plot of RF model", pad = 10)
plt.ylabel("Mean decrease in impurity", labelpad = 12)
plt.xticks([1,2,3,4], ['Sepal.Length','Sepal.Width', 
     'Petal.Length', 'Petal.Width'], rotation = 90)
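# note: the tick labels below are the rounded importances from one particular run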
plt.yticks(classifyRF.feature_importances_, 
           ['0.01','0.11','0.39','0.48'])
plt.grid()
plt.show()

Figure 9.10: RF regression for ozone data

In [23]:
from sklearn.ensemble import RandomForestRegressor
# Read in the airquality data and create a copy
airquality = pd.read_csv("data/airquality.csv", 
                         index_col = 'Unnamed: 0')
airquality_copy = airquality.copy() 
print(airquality.head())#print the first 5 rows of data
print(np.shape(airquality))#data matrix dim (153, 6)
# Create a list of integers from 1 to 153
t0 = list(np.linspace(1,153,153, dtype=int)) 
# positions of recorded data
n1 = np.where(airquality["Ozone"] > 0)
n1 = n1[0].tolist()
n1 = [x+1 for x in n1]
# positions of unknown 'NaN' data
n0=[]
for x in t0:
    if x not in n1:
        n0.append(x)       
# Replace NA with medians in order to train the RF trees
airquality['Solar.R'].fillna(
    value=airquality['Solar.R'].median(), inplace=True)
airquality['Wind'].fillna(
    value=airquality['Wind'].median(), inplace=True)
airquality['Temp'].fillna(
    value=airquality['Temp'].median(), inplace=True)
airquality['Day'].fillna(
    value=airquality['Day'].median(), inplace=True)
airquality['Month'].fillna(
    value=airquality['Month'].median(), inplace=True)
airquality['Ozone'].fillna(
    value=airquality['Ozone'].median(), inplace=True)
# Create our features X and our target y
X = airquality[['Solar.R', 'Wind', 'Temp', 'Month', 'Day']]
y = airquality[['Ozone']]
# split our data into training and test sets 
X_train = X.loc[n1,:]
X_test = X.loc[n0,:]
y_train = y.loc[n1,:]
y_test = y.loc[n0,:]
#create the RF regressor model with 500 estimators
ozoneRFreg = RandomForestRegressor(n_estimators=500, 
                   oob_score=True, max_features = 2)
# fit model to our training set
ozoneRFreg.fit(X_train, y_train.values.ravel())
#create our prediction
ozonePrediction = ozoneRFreg.predict(X_test)
#create data frame with our results
result = X_test
result['Ozone'] = y_test
result['Prediction'] = ozonePrediction.tolist()
# create data frame with our predictions
ozone_filled = pd.DataFrame(None, index = np.arange(153), 
                            columns = ["Prediction"] )
ozone_filled.loc[n0, 'Prediction'] = result["Prediction"]
ozone_filled.index += 1

#Plot Fig. 9.10(a)
t1 = np.linspace(5,10,153)
# original known data
ar = pd.read_csv("data/airquality.csv", 
                         index_col = 'Unnamed: 0')
plt.plot(t1, ar["Ozone"], 
         'ko-', markersize = 3) 
plt.ylim(0, 170)
#unknown Predicted data
plt.plot(t1, ozone_filled['Prediction'], 
         'bo-', markersize = 3)
plt.ylabel("Ozone [ppb]", labelpad = 12)
MaySept = ["May","Jun", "Jul", "Aug", "Sep"]
plt.xticks([5,6,7,8,9], MaySept)
plt.title("(a) Observed (black) and RF filled (blue)")
plt.show()
   Ozone  Solar.R  Wind  Temp  Month  Day
1   41.0    190.0   7.4    67      5    1
2   36.0    118.0   8.0    72      5    2
3   12.0    149.0  12.6    74      5    3
4   18.0    313.0  11.5    62      5    4
5    NaN      NaN  14.3    56      5    5
(153, 6)
In [24]:
#Plot Fig. 9.10(b)
# combine unknown and known data into one dataframe
result_copy = result.copy()
result_copy = result_copy.rename(
    columns = {'Ozone':'y_tested', 'Prediction':'Ozone'})
ozone_complete = airquality_copy[
    "Ozone"].fillna(result_copy["Ozone"])
#plot the combined data
t1 = np.linspace(5,10,153)
plt.plot(t1, ozone_complete, 'ro-', markersize = 3)
plt.ylim(0, 170)
plt.ylabel("Ozone [ppb]", labelpad = 12)
MaySept = ["May","Jun", "Jul", "Aug", "Sep"]
plt.xticks([5,6,7,8,9], MaySept)
plt.title("(b) RF-filled complete ozone data series")
plt.show()
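Which predictors the ozone regression relies on can be read from the fitted forest. A minimal sketch (the importance values vary from run to run):

for name, importance in zip(X.columns, ozoneRFreg.feature_importances_):
    print(name, round(importance, 3))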

Figure 9.11: A tree in a random forest

In [25]:
classifyRF = RandomForestClassifier(n_estimators=1, 
                                    oob_score=True)
training = classifyRF.fit(train_data.iloc[:,:4], 
              train_data.iloc[:,4])
features = ['sepal length (cm)', 'sepal width (cm)', 
            'petal length (cm)', 'petal width (cm)']
class_name = ['setosa', 'versicolor', 'virginica']
# set up figure 
plt.figure(figsize = (15,15))
plot_tree(classifyRF.estimators_[0],feature_names = features, 
          class_names = class_name, filled = True, 
          fontsize = 12)
plt.show()
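The same tree can also be printed as text rules, which is sometimes easier to inspect than the figure. A short sketch using scikit-learn's export_text:

from sklearn.tree import export_text
print(export_text(classifyRF.estimators_[0], feature_names = features))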

Figure 9.12: NN recruitment decision

In [26]:
#!pip3 install ann_visualizer
In [27]:
#!pip3 install keras-models
In [28]:
#Python code for the NN recruitment decision and Fig. 9.12
#imports to build neural network model and visualize
#You need to install the following Python environments:
#tensorflow and keras-models
#!pip3 install ann_visualizer
#!pip3 install keras-models
#from a terminal: pip install tensorflow 
from ann_visualizer.visualize import ann_viz
#from graphviz import Source
#from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense

TKS = [20,10,30,20,80,30]
CSS = [90,20,40,50,50,80]
Recruited = [1,0,0,0,1,1]
#combine multiple columns into a single set 
df = pd.DataFrame({'TKS': TKS,'CSS': CSS,
     'Recruited': Recruited})
df.head()
X = df[["TKS", "CSS"]]
y = df[["Recruited"]]
#random.seed(123)
#Results vary from run to run because the training set is very small

nn = Sequential()
nn.add(Dense(5, input_dim = 2, activation = "sigmoid"))
nn.add(Dense(1,activation = "sigmoid"))
#nn.add(Dense(2,activation = "sigmoid"))
nn.compile(optimizer = "adam", loss = "BinaryCrossentropy", 
           metrics = "BinaryAccuracy")
#fit neural network to data
nn.fit(X,y)
#visualize the neural network
ann_viz(nn, title = "Recruitment Decision")
#output the nn model weights and biases
print(nn.get_weights())

TKS = [30,51,72] #new data for decision
CSS = [85,51,30] #new data for decision
test = pd.DataFrame({'TKS': TKS,'CSS': CSS})
prediction = nn.predict(test)
print(prediction)
#[[0.70047873]
# [0.66671777]
# [0.38699177]]
#Converting probabilities into decisions
for i in range(3):
  if prediction[i] >= 0.5:
    print("Hire")
  else:
    print("Reject")
#Hire
#Hire
#Reject
2024-01-04 14:55:17.520283: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
1/1 [==============================] - 0s 460ms/step - loss: 0.8451 - binary_accuracy: 0.5000
[array([[-0.33571917, -0.83156055, -0.10773458,  0.00750285, -0.6183248 ],
       [-0.2995574 , -0.24895859, -0.63877285,  0.17075612,  0.5903464 ]],
      dtype=float32), array([ 2.4525088e-04, -9.5823279e-06, -2.9454670e-06,  9.9640409e-04,
        9.8342693e-04], dtype=float32), array([[-0.34962773],
       [ 0.52244776],
       [ 0.27948168],
       [-0.8362598 ],
       [-0.5269465 ]], dtype=float32), array([0.00099999], dtype=float32)]
1/1 [==============================] - 0s 102ms/step
[[0.20388202]
 [0.28145534]
 [0.30314443]]
Reject
Reject
Reject
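Because the training set has only six cases, repeated runs give different weights and decisions, as the outputs above show. Fixing the random seeds before building the model (a minimal sketch, not in the original code) makes a single run easier to reproduce, although results still depend on the TensorFlow version:

import tensorflow as tf
np.random.seed(123)
random.seed(123)
tf.random.set_seed(123)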

Figure 9.13: Curve of a logistic function

In [29]:
# define variables (named to match the axis labels z and g(z))
z = np.linspace(-2,4,101)
k = 3.2
z0 = 1

# plot figure
plt.plot(z, 1/(1+ np.exp(-k*(z-z0))), 'b', linewidth = 2.5)
# add labels
plt.title('Logistic function for k = 3.2, z0 = 1.0', 
          pad = 10)
plt.xlabel("z", labelpad = 10)
plt.ylabel("g(z)", labelpad = 10)
plt.show()
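Two quick checks of the curve: it passes through 0.5 at z = z0, and its slope there is k/4 (0.8 for k = 3.2). A minimal numerical sketch:

g = lambda z: 1/(1 + np.exp(-k*(z - z0)))
print(g(z0))                                # 0.5
eps = 1e-6
print((g(z0 + eps) - g(z0 - eps))/(2*eps))  # approximately k/4 = 0.8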

Python NN code for the iris flower prediction

In [30]:
from ann_visualizer.visualize import ann_viz
from graphviz import Source
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

iris = pd.read_csv("data/iris.csv") #150-by-5 iris data
#attach True or False columns to iris data
iris['setosa'] = iris['class'].map(lambda x: 
                            x == "Iris-setosa")
iris['versicolor'] = iris['class'].map(lambda x: 
                            x == "Iris-versicolor")
iris['virginica'] = iris['class'].map(lambda x: 
                            x == "Iris-virginica")

X = iris.iloc[:,0:4]#length and width data
y = iris.iloc[:,5:8]#species data
y["setosa"] = y["setosa"].astype(int)
y["versicolor"] = y["versicolor"].astype(int)
y["virginica"] = y["virginica"].astype(int)

#Randomly generate training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                        test_size=0.5)
nn = Sequential() #define neural network nn
nn.add(Dense(10, activation='relu', input_dim = 4))
nn.add(Dense(10, activation='relu', input_dim = 10))
nn.add(Dense(3, activation='sigmoid'))
nn.compile(optimizer = "sgd", loss = "binary_crossentropy", 
           metrics = ["accuracy"])
nn.fit(X_train, y_train, epochs = 5)#train nn
prediction = nn.predict(X_test) #use nn to predict
#output prediction result
p = prediction
for i in range(0, p.shape[0]):
    p[i] = np.where(p[i] == np.max(p[i]), True, False)
np.hstack((p, y_test)) 
#array([[1., 0., 0., 1., 0., 0.],
#       [1., 0., 0., 1., 0., 0.],
#       [1., 0., 0., 1., 0., 0.],
#       [1., 0., 0., 1., 0., 0.],
#       [0., 0., 1., 0., 0., 1.],
#       [0., 0., 1., 0., 0., 1.],
#       [1., 0., 0., 1., 0., 0.],
#       [0., 0., 1., 0., 0., 1.],
#       [1., 0., 0., 1., 0., 0.],
#       [0., 0., 1., 0., 1., 0.],
#The left 3 columns are predictions and the right 3 are validation
#The first 9 predictions are correct and the 10th is wrong
Epoch 1/5
3/3 [==============================] - 0s 2ms/step - loss: 0.6702 - accuracy: 0.3200
Epoch 2/5
3/3 [==============================] - 0s 2ms/step - loss: 0.6663 - accuracy: 0.3200
Epoch 3/5
3/3 [==============================] - 0s 2ms/step - loss: 0.6605 - accuracy: 0.3200
Epoch 4/5
3/3 [==============================] - 0s 3ms/step - loss: 0.6569 - accuracy: 0.3200
Epoch 5/5
3/3 [==============================] - 0s 3ms/step - loss: 0.6534 - accuracy: 0.3200
3/3 [==============================] - 0s 1ms/step
Out[30]:
array([[0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.]])
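The overall fraction of correct one-hot predictions can be computed from the same stacked array. A short check (the score depends on the random split and the very short training above):

stacked = np.hstack((p, y_test))
correct = (stacked[:, :3] == stacked[:, 3:]).all(axis = 1)
print(correct.mean())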