N = 3; K =2
mydata <- matrix(c(1, 1, 2, 1, 3, 3.5),
nrow = N, byrow = TRUE)
x1 = mydata[1, ]
x2 = mydata[2, ]
x3 = mydata[3, ]
#Case C1 = (P1, P2)
c1 = (mydata[1, ] + mydata[2, ])/2
c2 = mydata[3, ]
tWCSS = norm(x1 - c1, type = '2')^2 +
norm(x2 - c1, type = '2')^2 +
norm(x3 - c2, type = '2')^2
tWCSS
## [1] 0.5
#[1] 0.5
#Case C1 = (P1, P3)
c1 = (mydata[1, ] + mydata[3, ])/2
c2 = mydata[2, ]
norm(x1 - c1, type = '2')^2 +
norm(x3 - c1, type = '2')^2 +
norm(x2 - c2, type = '2')^2
## [1] 5.125
#[1] 5.125
#Case C1 = (P2, P3)
c1 = (mydata[2, ] + mydata[3, ])/2
c2 = mydata[1, ]
norm(x2 - c1, type = '2')^2 +
norm(x3 - c1, type = '2')^2 +
norm(x1 - c2, type = '2')^2
## [1] 3.625
#[1] 3.625
#The case C1 = (P1, P2) can be quickly found by
kmeans(mydata, 2)
## K-means clustering with 2 clusters of sizes 1, 2
##
## Cluster means:
## [,1] [,2]
## 1 3.0 3.5
## 2 1.5 1.0
##
## Clustering vector:
## [1] 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 0.0 0.5
## (between_SS / total_SS = 91.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#Clustering vector:
#[1] 1 1 2 #points P1, P2 in C1
#setwd("~/climstats")
N = 3 #The number of data points
K = 2 #Assume K clusters
mydata = matrix(c(1, 1, 2, 1, 3, 3.5),
nrow = N, byrow = TRUE)
Kclusters = kmeans(mydata, K)
Kclusters #gives the K-means results,
## K-means clustering with 2 clusters of sizes 1, 2
##
## Cluster means:
## [,1] [,2]
## 1 3.0 3.5
## 2 1.5 1.0
##
## Clustering vector:
## [1] 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 0.0 0.5
## (between_SS / total_SS = 91.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#e.g., cluster centers and WCSS
#Cluster means:
#[,1] [,2]
#1 1.5 1.0
#2 3.0 3.5
#Within cluster sum of squares by cluster:
# [1] 0.5 0.0
Kclusters$centers
## [,1] [,2]
## 1 3.0 3.5
## 2 1.5 1.0
par(mar = c(4,4,2.5,0.5))
plot(mydata[,1], mydata[,2], lwd = 2,
xlim =c(0, 4), ylim = c(0, 4),
xlab = 'x', ylab = 'y', col = c(2, 2, 4),
main = 'K-means clustering for
three points and two clusters',
cex.lab = 1.4, cex.axis = 1.4)
points(Kclusters$centers[,1], Kclusters$centers[,2],
col = c(2, 4), pch = 4)
text(1.5, 0.8, bquote(C[1]), col = 'red', cex = 1.4)
text(3.2, 3.5, bquote(C[2]), col = 'skyblue', cex = 1.4)
text(1, 1.2, bquote(P[1]), cex = 1.4, col = 'red')
text(2, 1.2, bquote(P[2]), cex = 1.4, col = 'red')
text(3, 3.3, bquote(P[3]), cex = 1.4, col = 'skyblue')
#data at Miami International Airport, Station ID USW00012839
#setwd("~/climstats")
dat = read.csv("data/MiamiIntlAirport2001_2020.csv",
header=TRUE)
dim(dat)
## [1] 7305 29
#[1] 7305 29
tmin = dat[,'TMIN']
wdf2 = dat[,'WDF2']
# plot the scatter diagram Tmin vs WDF2
#setEPS() #Plot the data of 150 observations
#postscript("fig0902a.eps", width=5, height=5)
par(mar=c(4.5, 4.5, 2, 4.5))
plot(tmin[2:366], wdf2[2:366],
pch =16, cex = 0.5,
xlab = 'Tmin [deg C]',
ylab = 'Wind Direction [deg]', grid())
title('(a) 2001 Daily Miami Tmin vs WDF2', cex.main = 1, line = 1)
axis(4, at = c(0, 45, 90, 135, 180, 225, 270, 315, 360),
lab = c('N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW', 'N'))
mtext('Wind Direction', side = 4, line =3)
#dev.off()
#K-means clustering
K = 2 #assuming K = 2, i.e., 2 clusters
mydata = cbind(tmin[2:366], wdf2[2:366])
fit = kmeans(mydata, K) # K-means clustering
#Output the coordinates of the cluster centers
fit$centers
## [,1] [,2]
## 1 21.93357 103.9161
## 2 18.38608 278.8608
#1 18.38608 278.8608
#2 21.93357 103.9161
fit$tot.withinss # total WCSS
## [1] 457844.9
#[1] 457844.9 for # the value may vary for each run
#Visualize the clusters by kmeans.ani()
mycluster <- data.frame(mydata, fit$cluster)
names(mycluster)<-c('Tmin [deg C]',
'Wind Direction [deg]',
'Cluster')
library(animation)
par(mar = c(4.5, 4.5, 2, 4.5))
kmeans.ani(mycluster, centers = K, pch=1:K, col=1:K,
hints = '')