Statistics and Data Visualizations in Climate Science

with R and Python

 

A Cambridge University Press Book by

SSP Shen and GR North

 

 

Version 1.0 released in July 2023 and coded by Dr. Samuel Shen, Distinguished Professor
San Diego State University, California, USA
https://shen.sdsu.edu
Email:

 

 

 

 

Chapter 4: Regression Models and Methods

Plot Fig. 4.1: Colorado Temperature Lapse Rate: R code

x= c(
1671.5, 1635.6, 2097.0, 1295.4, 1822.7, 2396.9, 2763.0, 1284.7,
1525.2, 1328.6, 1378.9, 2323.8, 2757.8, 1033.3, 1105.5, 1185.7,
2343.9, 1764.5, 1271.0, 2347.3, 2094.0, 2643.2, 1837.9, 1121.7)
y= c(
22.064, 23.591, 18.464, 23.995, 20.645, 17.175, 13.582, 24.635,
22.178, 24.002, 23.952, 16.613, 13.588, 25.645, 25.625, 25.828,
17.626, 22.433, 24.539, 17.364, 17.327, 15.413, 22.174, 24.549)
#setEPS() # save the .eps figure 
#postscript("fig0401.eps",  width = 8)
par(mar=c(4.5,4.5,2.5,0.5))
plot(x,y, 
     xlab="Elevation [m]",
     ylab=expression("Temperature ["~degree~"C]"),
  main="Colorado Elevation and July Tmean: 1981-2010 Average",
     cex.lab=1.5, cex.axis=1.5, cex.main =1.2)
reg=lm(y~x)
reg
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##   33.476300    -0.006982
#(Intercept)            x  
# 33.476216    -0.006982   #-7.0 degC/1km
summary(reg)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52923 -0.60674 -0.08437  0.40181  1.53427 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.4763004  0.5460279   61.31   <2e-16 ***
## x           -0.0069819  0.0002913  -23.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7808 on 22 degrees of freedom
## Multiple R-squared:  0.9631, Adjusted R-squared:  0.9614 
## F-statistic: 574.5 on 1 and 22 DF,  p-value: < 2.2e-16
#R-squared:  0.9631
abline(reg,lwd=3)
text(2100, 25.5, 
expression("Temperature lapse rate: 7.0"~degree~"C/1.0km"), 
     cex=1.5)
text(2350, 24, "y = 33.48 - 0.0070 x", cex=1.5)
text(2350, 22.5,"R-squared = 0.96", cex=1.5)

#dev.off()

 

R Code for the Colorado TLR Regression Analysis

lm(y ~ x)
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##   33.476300    -0.006982
#(Intercept)            x  
#33.476216    -0.006982 
reg = lm(y ~ x)
round(reg$residuals, digits = 5)
##        1        2        3        4        5        6        7        8 
##  0.25792  1.53427 -0.37129 -0.43697 -0.10542  0.43358 -0.60335  0.12833 
##        9       10       11       12       13       14       15       16 
## -0.64953 -0.19817  0.10302 -0.63880 -0.63366 -0.61692 -0.13283  0.63012 
##       17       18       19       20       21       22       23       24 
##  0.51454  1.27623 -0.06333  0.27628 -1.52923  0.39122  1.52971 -1.09572
#       1        2        3        4        5        6 
#0.25792  1.53427 -0.37129 -0.43697 -0.10542  0.43358 
#7        8        9       10       11       12 
#-0.60335  0.12833 -0.64953 -0.19817  0.10302 -0.63880 
#13       14       15       16       17       18 
#-0.63366 -0.61692 -0.13283  0.63012  0.51454  1.27623 
#19       20       21       22       23       24 
#-0.06333  0.27628 -1.52923  0.39122  1.52971 -1.09572 

mean(reg$residuals)
## [1] 1.62043e-17
#[1] 1.62043e-17

xa = x - mean(x)
sum(xa*reg$residuals)
## [1] -2.83773e-13
#[1] -2.83773e-13

sum((reg$residuals)^2)/(length(y) -2)
## [1] 0.6096193
#[1] 0.6096193

 

Plot Fig. 4.2: R code

#setEPS() 
#postscript("fig0402.eps",  height = 5, width = 8)
par(mar=c(0.0,0.5,0.0,0.5))
plot(0,0, xlim=c(0,5.2), ylim=c(0,2.2),
     axes = FALSE, xlab="", ylab="")
arrows(0,0,4,0, angle=5, code=2, lwd=3, length=0.5)
arrows(4,0,4,2, angle=5, code=2, lwd=3, length=0.5)
arrows(0,0,4,2, angle=5, code=2, lwd=3, length=0.5)
arrows(5,0,4,2, angle=7, code=2, lwd=2, lty=3, length=0.5)
arrows(0,0,5,0, angle=7, code=2, lwd=2, lty=3, length=0.5)
arrows(3,0,4,2, angle=7, code=2, lwd=2, lty=3, length=0.5)
arrows(0,0,3,0, angle=7, code=2, lwd=2, lty=3, length=0.5)
segments(3.9,0, 3.9, 0.1)
segments(3.9, 0.1, 4.0, 0.1)
text(2,0.2, expression(hat(b)~bold(x)[a]), cex=2)
text(2,1.2, expression(bold(y)[a]), cex=2)
text(4.1,1, expression(bold(e)), cex=2)
text(3.8,0.6, expression(paste("Shortest ",bold(e))), 
     cex=1.5, srt=90)
text(3.4,1.1, expression(paste("Longer ",bold(e))), 
     cex=1.5, srt=71)
text(4.6,1.1, expression(paste("Longer ",bold(e))), 
     cex=1.5, srt=-71)

#dev.off()

 

R Code for Estimating Regression Slope b

#Method 1: Using vector projection
xa = x - mean(x)  #Anomaly the x data vector
nxa = sqrt(sum(xa^2)) #Norm of the anomaly data vector
ya = y - mean(y)
nya=sqrt(sum(ya^2))
sum(ya*(xa/nxa))/nxa #Compute b
## [1] -0.006981885
#[1] -0.006981885  #This is an estimate for b

#Method 2:  Using correlation: R code
corxy=cor(xa, ya) #Compute the correlation between xa and ya
corxy
## [1] -0.9813858
#[1] -0.9813858 #Very high correlation
corxy*nya/nxa #Compute b
## [1] -0.006981885
#[1] -0.006981885 #This is an estimate for b

 

R Code for Computing MV

var(reg$fitted.values)
## [1] 15.22721
#[1] 15.22721 
#Or another way
yhat = reg$fitted.values
var(yhat)
## [1] 15.22721
#[1] 15.22721 
#Or still another way
n = 24
sum((yhat - mean(yhat))^2)/(n-1)
## [1] 15.22721
#[1] 15.22721 

 

R Code for Computing YV

sum((y - mean(y))^2)/(n-1)
## [1] 15.81033
# [1] 15.81033  
#Or another way
var(y)
## [1] 15.81033
#[1] 15.81033

 

R Code for Computing R-squared value

var(reg$fitted.values)/var(y)
## [1] 0.9631181
#[1] 0.9631181  #This is the R-squared value

cor(x,y)
## [1] -0.9813858
#[1] -0.9813858
(cor(x,y))^2
## [1] 0.9631181
#[1] 0.9631181 #This is the R-squared value


#
#
qt(c(.025, .975), df=22)
## [1] -2.073873  2.073873
#[1] -2.073873  2.073873

summary(reg)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52923 -0.60674 -0.08437  0.40181  1.53427 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.4763004  0.5460279   61.31   <2e-16 ***
## x           -0.0069819  0.0002913  -23.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7808 on 22 degrees of freedom
## Multiple R-squared:  0.9631, Adjusted R-squared:  0.9614 
## F-statistic: 574.5 on 1 and 22 DF,  p-value: < 2.2e-16
-0.0069818 - 2.073873 * 0.0002913
## [1] -0.007585919
#[1] -0.007585919
-0.0069818 + 2.073873 * 0.0002913
## [1] -0.006377681
#[1] -0.006377681

33.4762157 - 2.073873*0.5460279
## [1] 32.34382
#[1] 32.34382
33.4762157 + 2.073873*0.5460279
## [1] 34.60861
#[1] 34.60861

(-0.0069818-(-0.0073))/0.0002913
## [1] 1.092345
#[1] 1.092345

 

R Plot Fig. 4.3: Confidence Interval of a Regression model

#setwd("/Users/sshen/climstats")
#Confidence interval of the linear model
x1 = seq(max(x), min(x),len=100)
n = 24
xbar = mean(x)
reg = lm(y ~ x)
SSE = sum((reg$residuals)^2)
s_squared = SSE/(length(y)-2)
s = sqrt(s_squared)
modTLR = 33.476216 + -0.006982*x1
xbar = mean(x)
Sxx = sum((x-xbar)^2)
CIupperModel= modTLR + qt(.975, df=n-2)*s*sqrt((1/n)+(x1-xbar)^2/Sxx)
CIlowerModel= modTLR - qt(.975, df=n-2)*s*sqrt((1/n)+(x1-xbar)^2/Sxx)
CIupperResponse= modTLR + qt(.975, df=n-2)*s*sqrt(1+(1/n)+(x1-xbar)^2/Sxx)
CIlowerResponse= modTLR - qt(.975, df=n-2)*s*sqrt(1+(1/n)+(x1-xbar)^2/Sxx)

#setEPS() #Plot the figure and save the file
#postscript("fig0403.eps", height = 8, width = 8)
par(mar=c(4.5,4.5,2.0,0.5))
plot(x,y, 
     ylim=c(10,30), xlim=c(1000,3000),
     xlab="Elevation [m]",
     ylab=bquote("Temperature ["~degree~"C]"),
     main="Colorado Elevation and July Tmean: 1981-2010 Average",
     cex.lab=1.5, cex.axis=1.5)
lines(x1,CIupperModel,type="l",col='red')
lines(x1,CIlowerModel,type="l",col='red')
lines(x1,CIupperResponse,type="l",col='blue')
lines(x1,CIlowerResponse,type="l",col='blue')
abline(reg,lwd=3)
text(2280, 26, 
bquote("Temperature lapse rate: 7.0"~degree~"C/km"), 
cex=1.5)
text(2350, 27.5,"R-squared = 0.96", cex=1.5)
text(2350, 29, "y= 33.48 - 0.0070 x", cex=1.5)
text(1600, 15,"Blue lines: CI of July Tmean RV", 
     col="blue", cex=1.5)
text(1600, 13.5,"Red lines: CI of the fitted model", 
     col="red", cex=1.5)