Statistics and Data Visualizations in Climate Science

with R and Python

 

A Cambridge University Press Book by

SSP Shen and GR North

 

 

Version 1.0 released in July 2023 and coded by Dr. Samuel Shen, Distinguished Professor
San Diego State University, California, USA
https://shen.sdsu.edu
Email:

 

 

 

 

Chapter 3: Estimation and Decision Making

Plot Fig. 3.1: R code

#Simulation of the standard error of mean
#setwd("/Users/sshen/climstats")
#setEPS() # save the .eps figure file to the working directory
#postscript("fig0301.eps", height = 5, width = 10)
mu = 0; sig = 10; m=10000; n = 100; k = m*n
x = rnorm(k, mean = mu, sd = sig)
par(mar = c(4, 4.5, 2, 1.5))
par(mfrow=c(1,2))
hist(x, breaks = 201, xlim = c(-40,40),
     main = expression('Histogram of x: sd(x) = 10'),
     cex.lab = 1.5, cex.axis = 1.5,
     cex.main = 1.5)
text(-25, 19000, '(a)', cex =1.3)
xmat = matrix(x, ncol = n)
xbar = rowMeans(xmat)
hist(xbar, breaks = 31, xlim = c(-40,40),
     xlab = expression(bar(x)),
     main = expression('Histogram of '*bar(x)* 
                         ': sd('*bar(x)*') = 1'),
     cex.lab = 1.5, cex.axis = 1.5,
     cex.main = 1.5)
text(-25, 750, '(b)', cex =1.3)

#dev.off()

 

Verification of 95% CI by Numerical Simulation

m=10000 #10,000 experiments
x = 1:m
n = 30 #sample size n
truemean = 8
da = matrix(rnorm(n*m, mean = truemean, sd = 1), nrow = m)
esmean = rowMeans(da) #sample mean
library(matrixStats)
essd = rowSds(da) #sample SD
upperci = esmean + 1.96*essd/sqrt(n) #interval upper limit
lowerci = esmean - 1.96*essd/sqrt(n) #interval lower limit
l=0
for(k in 1:m){
  if(upperci[k] >= truemean & lowerci[k] <= truemean )
    l=l+1
} #Determine if the true mean is inside the interval
l/m #Percentage of truth
## [1] 0.9406
#[1] 0.9425 #which is approximately 0.95

 

Plot Fig. 3.2: R code

#Plot confidence intervals and tail probabilities
#setwd("/Users/sshen/climstats")
#setEPS() # save the .eps figure file to the working directory
#postscript("fig0302.eps", height = 4.5, width = 6.5)
par(mar=c(2.5,3.5,2.0,0.5))
rm(list=ls())
par(mgp=c(1.4,0.5,0))
curve(dnorm(x,0,1), xlim=c(-3,3), lwd=3,
      main='Confidence Intervals and Confidence Levels',
      xlab="True mean as a normally distributed random variable", 
      ylab="", xaxt="n", cex.lab=1.2) 
title(ylab='Probability density', line=2, cex.lab=1.2)
polygon(c(-1.96, seq(-1.96,1.96,len=100), 1.96),
        c(0,dnorm(seq(-1.96,1.96,len=100)),0),col='skyblue')
polygon(c(-1.0,seq(-1.0, 1, length=100), 1),
        c(0, dnorm(seq(-1.0, 1, length=100)), 0.0),col='white')
polygon(c(-3.0,seq(-3.0, -1.96, length=100), -1.96),
        c(0, dnorm(seq(-3.0, -1.96, length=100)), 0.0),col='red')
polygon(c(1.96,seq(1.96, 3.0, length=100), 3.0),
        c(0, dnorm(seq(1.96, 3.0, length=100)), 0.0),col='red')
points(c(-1,1), c(0,0), pch=19, col="blue")
points(0,0, pch=19)
points(c(-1.96,1.96),c(0,0),pch=19, col="red")
text(0,0.02, expression(bar(x)), cex=1.0)
text(-1.50,0.02, "SE", cex=1.0)
text(-0.60,0.02, "SE", cex=1.0)
text(1.50,0.02, "SE", cex=1.0)
text(0.60,0.02, "SE", cex=1.0)
text(0,0.2, "Probability 
     = 0.68")
arrows(-2.8,0.06,-2.35,0.01, length=0.1)
text(-2.5,0.09, "Probability
     =0.025") 

#dev.off()

 

R Code for Edmonton Jan 1880-1929 Temp Statistics

#setwd('/Users/sshen/climstats')
da1 =read.csv("data/Lat52.5_Lon-112.5.csv", header=TRUE)
dim(da1)
## [1] 1642    3
#[1] 1642    3 #1642 months: Jan 1880 - Oct 2016
da1[1:2,]
##                                                    Level.Name       Date
## 1 NOAA Merged Land Ocean Global Surface Temperature Anomalies 1880-01-01
## 2 NOAA Merged Land Ocean Global Surface Temperature Anomalies 1880-02-01
##     Value
## 1 -7.9609
## 2 -4.2510
#        Date   Value
#1 1880-01-01 -7.9609
#2 1880-02-01 -4.2510
jan = seq(1, 1642, by=12)
Tjan = da1[jan,]
TJ50 = Tjan[1:50, 3]
xbar = mean(TJ50)
sdEdm = sd(TJ50)
EM = 1.96*sdEdm/sqrt(50) 
CIupper = xbar + EM
CIlower = xbar - EM
round(c(xbar, sdEdm, EM, CIlower, CIupper), digits =2)
## [1] -2.47  4.95  1.37 -3.84 -1.10
#[1] -2.47  4.95  1.37 -3.84 -1.10

 

Plot Fig. 3.3: R code

#52.5N, 112.5W, Edmonton, NOAAGlobalTemp Version 3.0
#setwd("/Users/sshen/climstats")
#setEPS() # save the .eps figure file to the working directory
#postscript("fig0303.eps", height = 7, width = 10)
da1 =read.csv("data/Lat52.5_Lon-112.5.csv", header=TRUE)
jan =seq(1, 1642, by=12)
Tjan = da1[jan,]
t=1880:2016
regJan = lm(Tjan[,3] ~ t)
par(mar=c(3.6,4.5,2,1.3), mgp = c(2.3, 0.9, 0))
plot(t, Tjan[,3], type="l",
     main = "Edmonton January Surface Air Temperature Anomalies",
     xlab = "Time: 1880-2016", 
     ylab = expression("Temperature Anomaly:"~degree~C),
     ylim=c(-20,10), cex.lab=1.4, cex.axis=1.4)
m19972006 = mean(Tjan[118:137, 3]) #1997--2016 Jan mean
m18801929 = mean(Tjan[1:50, 3]) #1880--1929 Jan mean
EM = 1.96*sd(Tjan[1:50, 3])/sqrt(50)
lines(t, rep(0,length(t)), lty = 2)
lines(t[118:137], rep(m19972006, 20), 
      col = 'blue', lwd = 3, lty = 1)
lines(t[1:50], rep(m18801929, 50), 
      col = 'darkgreen', lwd = 3, lty = 1)
lines(t[1:50], rep(m18801929 - EM, 50), 
      col = 'darkgreen', lwd = 1.5, lty = 3)
lines(t[1:50], rep(m18801929 + EM, 50), 
      col = 'darkgreen', lwd = 1.5, lty = 3)
abline(regJan, col='red', lwd = 2)
text(1920, 9, 
     expression("Linear trend: 3.78"~degree~C~"per century"), 
     col="red", cex=1.4) 
text(2005, -17, 
     paste('1997-2016', '\nmean = ', 
           round({m19972006}, digits = 2)),
     col = 'blue', cex = 1.4)
text(1905, -17, 
     paste('1880-1929', '\nmean = ', 
           round({m18801929}, digits = 2)),
     col = 'darkgreen', cex = 1.4)