import numpy as np
import math
import statistics as stats
import scipy.stats as scistats
import scipy.odr as ordire
import matplotlib.pyplot as plt
import os
import pandas as pd
from pandas import read_table, read_csv
from netCDF4 import Dataset as ds
# These import statements bring in the Python packages we'll need.
#Style dictionary to standardize the plotting scheme across scripts
styledict = {'xtick.labelsize':25,
'xtick.major.size':9,
'xtick.major.width':1,
'ytick.labelsize':25,
'ytick.major.size':9,
'ytick.major.width':1,
'legend.framealpha':0.0,
'legend.fontsize':15,
'axes.labelsize':20,
'axes.titlesize':25,
'axes.linewidth':2,
'figure.figsize':(12,8),
'savefig.format':'jpg'}
plt.rcParams.update(**styledict)
# go to your working directory
os.chdir("/Users/sshen/climstats")
def linregress_CIs(xd, yd, conf = 0.95):
    alpha = 1. - conf # significance level
    n = xd.size # data sample size
    x = np.linspace(xd.min(), xd.max(), 100)
    # Predicted values from fitted model:
    a, b, r, p, err = scistats.linregress(xd, yd)
    y = a*x + b
    sd = 1./(n - 2.)*np.sum((yd - a*xd - b)**2)
    sd = np.sqrt(sd)
    sxd = np.sum((xd - xd.mean())**2)
    sx = (x - xd.mean())**2
    # quantile of Student's t distribution for p = 1 - alpha/2
    q = scistats.t.ppf(1. - alpha/2, n - 2)
    # half-widths: dy for the CI of the fitted model,
    # dr for the prediction interval of the response
    dy = q*sd*np.sqrt(1./n + sx/sxd)
    dr = q*sd*np.sqrt(1. + 1./n + sx/sxd)
    rl = y - dr
    ru = y + dr
    yl = y - dy
    yu = y + dy
    return yl, yu, x, rl, ru
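#A minimal usage sketch on synthetic data (illustrative only,
#not part of the original code)
xs = np.linspace(0., 10., 50)
ys = 2.*xs + 1. + np.random.default_rng(0).normal(size = 50)
yls, yus, xgs, rls, rus = linregress_CIs(xs, ys)
print(yus[0] - yls[0], rus[0] - rls[0]) #prediction band is wider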
x = np.array([
1671.5, 1635.6, 2097.0, 1295.4, 1822.7,2396.9,2763.0,1284.7,
1525.2, 1328.6, 1378.9, 2323.8, 2757.8,1033.3,1105.5,1185.7,
2343.9, 1764.5, 1271.0, 2347.3, 2094.0,2643.2,1837.9,1121.7
])
y = np.array([
22.064, 23.591, 18.464, 23.995, 20.645,17.175,13.582,24.635,
22.178, 24.002, 23.952, 16.613, 13.588,25.645,25.625,25.828,
17.626, 22.433, 24.539, 17.364, 17.327,15.413,22.174,24.549
])
# calculate correlation coefficients
corrMatr = np.corrcoef(x, y)
# R-squared
Rsqu = corrMatr[0, 1]**2
# trend line
reg1 = np.array(np.polyfit(x, y, 1))
abline = reg1[1] + x*reg1[0]
fig, ax = plt.subplots(figsize=(12, 12))
ax.plot(x, y, 'ko');
ax.plot(x, abline, 'k-');
ax.set_title("Colorado Elevation vs. July Tmean: \n \
1981 - 2010 Average",
size = 25, fontweight = 'bold', pad = 20)
ax.set_xlabel("Elevation [$m$]", size = 25, labelpad = 20)
ax.set_ylabel("Temperature [$\degree$C]",
size = 25, labelpad = 20);
ax.tick_params(length=6, width=2, labelsize=20);
ax.set_xticks(np.round(np.linspace(1000, 3000, 5), 2))
ax.set_yticks(np.round(np.linspace(12, 26, 8), 2))
ax.text(1750, 25.5,
r"Temperature lapse rate: 7.0 $\degree $C/km",
color= 'k', size = 20)
ax.text(2250, 24.5, r"$y = 33.48 - 0.0070~x$",
color= 'k', size = 20)
ax.text(2250, 23.5, r"R-squared $= %.2f$" % Rsqu,
color= 'k', size = 20)
plt.show()
reg = np.polyfit(x, y, 1)
print(reg) #slope and intercept
#[-6.98188456e-03 3.34763004e+01]
regfit = np.polyval(reg,x)
regresiduals = y - regfit
print(np.round(regresiduals, 5))
print(np.mean(regresiduals))
#-5.181040781584064e-16 #should be almost zero
xa = np.array(x) - np.mean(x)
print(np.dot(xa, regresiduals)) #orthogonality of residuals and x anomalies
#-1.48929757415317e-11 #should be almost zero
print(np.sum(regresiduals**2)/(y.size - 2))
#0.6096193452251238 #unbiased MSE estimate
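#A hedged cross-check (not part of the original code): the unbiased MSE
#implies a slope standard error of sqrt(MSE/Sxx), which should agree
#with the stderr reported by scipy's linregress
mse = np.sum(regresiduals**2)/(y.size - 2)
print(np.sqrt(mse/np.sum(xa**2)))
print(scistats.linregress(x, y).stderr) #the two values should match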
plt.figure(figsize=(10, 8))# Define figure size
plt.xlim([0, 5.2]); plt.ylim([0, 2.2])
plt.axis('off')
plt.annotate('', ha = 'center', va = 'bottom',
xytext = (0, 0), xy = (4.12, 0),
arrowprops = dict(facecolor='black', lw = 4,
arrowstyle="->, head_width=0.5, head_length=3")
)
plt.annotate('', ha = 'center', va = 'bottom',
xytext = (4, 0), xy = (4, 2.13),
arrowprops = dict(facecolor='black', lw = 4,
arrowstyle="->, head_width=0.4,head_length=4.5")
)
plt.annotate('', ha = 'center', va = 'bottom',
xytext = (0, 0), xy = (4.07, 2.07),
arrowprops = dict(facecolor='black', lw = 4,
arrowstyle="->, head_width=0.5,head_length=3"))
plt.plot([0, 5], [0, 0], color ='k', linewidth = 8,
dashes = (3, 1))
plt.plot([3, 4], [0, 2], color ='k', linewidth = 3,
dashes = (3, 1))
plt.plot([5, 4], [0, 2], color ='k', linewidth = 3,
dashes = (4, 1))
plt.plot([3.9, 3.9], [0, 0.1], color ='k')
plt.plot([3.9, 4.0], [0.1, 0.1], color ='k')
plt.text(2, 0.1, r'$\hat{b}~\mathbf{x}_a$', fontsize = 30)
plt.text(2, 1.2, r'$\mathbf{y}_a$', fontsize = 30)
plt.text(4.08, 0.8, r'$\mathbf{e}$', fontsize = 30)
plt.text(3.8, 0.6, r'Shortest $\mathbf{e}$',
rotation = 90, fontsize = 20)
plt.text(3.3, 1.1, r'Longer $\mathbf{e}$',
rotation = 73, fontsize = 20)
plt.text(4.3, 1.1, r'Longer $\mathbf{e}$',
rotation = -73, fontsize = 20)
plt.show()
#Method 1: Using vector projection
xa = x - np.mean(x) #x anomalies: deviations from the mean
nxa = np.sqrt(np.sum(xa**2)) #Norm of the x anomaly vector
ya = y - np.mean(y) #y anomalies
nya = np.sqrt(np.sum(ya**2)) #Norm of the y anomaly vector
print(np.sum(ya*(xa/nxa))/nxa) #Compute the slope b
#-0.006981885 #This is an estimate for b
#Method 2: Using correlation
from scipy.stats import pearsonr
corxy, _ = pearsonr(xa, ya) #Correlation between xa and ya
print(corxy)
#-0.9813858 #Very strong negative correlation
print(corxy*nya/nxa) #Compute the slope b
#-0.006981885 #The same estimate for b
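#A hedged third cross-check (not in the book's two methods): the OLS
#slope also equals the sample covariance of x and y over the variance of x
covxy = np.sum(xa*ya)/(x.size - 1)
varx = np.sum(xa**2)/(x.size - 1)
print(covxy/varx) #should reproduce about -0.006981885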
from statistics import variance
reg = np.polyfit(x, y, 1)
yhat = np.polyval(reg,x)
#computing MV, the variance of the fitted values yhat
print('MV = ', variance(yhat))
#MV = 15.227212262175959
#Or another way
n = 24
print('MV = ',
np.sum((yhat - np.mean(yhat))**2)/(n-1))
#MV = 15.227212262175959
#computing YV, the variance of the y data
print('YV =', np.sum((y - np.mean(y))**2)/(n-1))
#YV = 15.810326418478262
#Or another way
print('YV =', variance(y))
#YV = 15.81032641847826
#computing R-squared value
print('R-squared = ', variance(yhat)/variance(y))
#R-squared = 0.9631181456430407
#computing correlation and R-squared
from scipy.stats import pearsonr
corxy, _ = pearsonr(x, y)
print('Correlation r_xy = ', corxy)
#Correlation r_xy = -0.9813858291431772
print('R-squared = ', corxy**2)
#R-squared = 0.9631181456430413
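#A hedged cross-check (not part of the original code): sklearn's r2_score
#computes 1 - SSE/SST, which equals MV/YV for an OLS fit with an intercept
from sklearn.metrics import r2_score
print('R-squared = ', r2_score(y, yhat)) #should match the values above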
reg1 = np.array(np.polyfit(x, y, 1))#regression
abline = reg1[1] + x*reg1[0]
# get confidence intervals
yl,yu,xd,rl,ru = linregress_CIs(x,y)
# plot the figure
fig, ax = plt.subplots(figsize=(13,12))
ax.plot(x,y, 'ko');
ax.plot(x, abline, 'k-');
ax.plot(xd, yu, 'r-')
ax.plot(xd, yl, 'r-')
ax.plot(xd, ru, 'b-')
ax.plot(xd, rl, 'b-')
ax.set_title("Colorado Elevation vs. July Tmean \n \
1981 - 2010 Average",size=25, pad = 20,
fontweight = 'bold')
ax.set_xlabel("Elevation [m]", size = 25, labelpad = 20)
ax.set_ylabel(r"Temperature [$\degree$C]",
size = 25, labelpad = 20);
ax.tick_params(length=6, width=2, labelsize=20);
ax.set_xticks(np.round(np.linspace(1000, 3000, 5), 2))
ax.set_yticks(np.round(np.linspace(10, 30, 5), 2))
ax.text(1600, 25, r"Temp lapse rate: 7.0$\degree$C/km",
color= 'k', size = 20)
ax.text(1750, 27,
r"$y = %1.2f %1.4f ~x$"%(reg1[1], reg1[0]),
size = 20)
ax.text(1800, 26, r"R-squared $= %.2f$" % Rsqu,
color= 'k', size = 20)
ax.text(1050, 15, "Blue lines: CI of July Tmean RV",
color= 'b', size = 20)
ax.text(1050, 14, "Red lines: CI of the fitted model",
color= 'r', size = 20)
plt.show()
reg1 = np.array(np.polyfit(x, y, 1))#regression
abline = reg1[1] + x*reg1[0]
# calculate residuals
r = y - abline
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(x, r, 'kd')
ax.set_title("Residuals of the Colorado 1981 - 2010 July \n\
Tmean vs. Elevation",fontweight = 'bold', size=25, pad = 20)
ax.set_xlabel("Elevation [m]", size = 25, labelpad = 20)
ax.set_ylabel(r"Residual Temp [$\degree$C]",
size = 25, labelpad = 20);
ax.tick_params(length=6, width=2, labelsize=20);
ax.set_xticks(np.linspace(1000, 3000, 5))
ax.set_yticks(np.linspace(-2,2,5))
plt.show()
fig, ax = plt.subplots(figsize=(12,8))#fig setup
reg1 = np.array(np.polyfit(x, y, 1))#regression
abline = reg1[1] + x*reg1[0]
r = y - abline# calculate residuals
#Q-Q plot as a probability plot: quantiles vs quantiles
pp1 = scistats.probplot(r, dist="norm", plot=ax,)
ax.set_title("QQ-Normal Plot for Colorado TLR Residuals",
fontweight = 'bold',size = 25, pad = 20);
ax.set_ylabel("Sample Quantiles", size = 25, labelpad = 20);
ax.set_xlabel("Theoretical Quantiles",
size = 25, labelpad = 20);
ax.tick_params(length=6, width=2, labelsize=20);
ax.set_xticks(np.round(np.linspace(-2, 2, 5), 2))
plt.show()
#Black-white version of the Q-Q plot
fig, ax = plt.subplots(figsize=(12,8))#fig setup
reg1 = np.array(np.polyfit(x, y, 1))#regression
abline = reg1[1] + x*reg1[0]
r = y - abline# calculate residuals
#Q-Q plot as a probability plot: quantiles vs quantiles
pp1 = scistats.probplot(r, dist="norm", plot=ax,)
#modify the color, ticks and labels of the probability plot
ax.get_lines()[0].set_marker('d')
ax.get_lines()[0].set_color('k')
ax.get_lines()[1].set_color('k')
ax.get_lines()[1].set_linestyle('--')
ax.set_title("QQ-Normal Plot for the Colorado TLR Residuals",
fontweight = 'bold',size = 25, pad = 20);
ax.set_ylabel("Sample Quantiles", size = 25, labelpad = 20);
ax.set_xlabel("Theoretical Quantiles", size = 25, labelpad = 20);
ax.tick_params(length=6, width=2, labelsize=20);
ax.set_xticks(np.round(np.linspace(-2, 2, 5), 2))
plt.show()
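#A hedged numerical complement (not part of the original code): the
#Shapiro-Wilk test quantifies the normality suggested by the Q-Q plot
W, p_sw = scistats.shapiro(r)
print(W, p_sw) #a large p-value is consistent with normal residuals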
from statsmodels.stats.stattools import durbin_watson
orderET = np.arange(1,25)
ElevTemp = np.stack((x,y, orderET), axis = -1)
sorted_ind = np.argsort(ElevTemp[:,0],kind='mergesort')
dat1 = ElevTemp[sorted_ind]
# use the first order polynomial fit for linear regression
reg1 = np.array(np.polyfit(dat1[:,0], dat1[:,1], 1))
abline = reg1[1] + dat1[:,0]*reg1[0]
r = dat1[:,1] - abline# calculate residuals
#perform the Durbin-Watson test on the residuals
print(durbin_watson(r))
#2.307190038542777, close to 2: no strong serial correlation
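#A hedged sanity check (not part of the original code): the Durbin-Watson
#statistic is the sum of squared successive residual differences
#divided by the sum of squared residuals
dw_manual = np.sum(np.diff(r)**2)/np.sum(r**2)
print(dw_manual) #should reproduce 2.307190038542777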
import pymannkendall as mk
#Reuse dat1, reg1, abline, and the residuals r from the block above
#perform the Mann-Kendall trend test
dat1[:,2] = r #store the residuals in the third column
print(mk.original_test(dat1[:,2]))#test for residual trend
#p=0.6374381847429542, z=0.47128365713220055
print(mk.original_test(dat1[:,1]))#test for temp trend
#p=2.260863496417187e-09, z=-5.97786112467686
from sklearn import linear_model
lat=np.array([
39.9919, 38.4600, 39.2203, 38.8236, 39.2425, 37.6742,
39.6261, 38.4775, 40.6147, 40.2600, 39.1653, 38.5258,
37.7717, 38.0494, 38.0936, 38.0636, 37.1742, 38.4858,
38.0392, 38.0858, 40.4883, 37.9492, 37.1786, 40.0583
])
lon=np.array([
-105.2667,-105.2256,-105.2783,-102.3486,-107.9631,-106.3247,
-106.0353,-102.7808,-105.1314,-103.8156,-108.7331,-106.9675,
-107.1097,-102.1236,-102.6306,-103.2153,-105.9392,-107.8792,
-103.6933,-106.1444,-106.8233,-107.8733,-104.4869,-102.2189
])
elev = x; temp = y #The x and y data were entered earlier
dat = np.stack((lat, lon, elev), axis = -1)
print(dat[0:2,:]) #Show the data of the first two stations
#[[ 39.9919 -105.2667 1671.5 ]
# [ 38.46 -105.2256 1635.6 ]]
#Multivariate linear regression
xdat = dat
ydat = temp
regr = linear_model.LinearRegression()
multi_reg = regr.fit(xdat, ydat)
print('Intercept: \n', regr.intercept_)
#23.73406772023514
print('Coefficients: \n', regr.coef_)
#[-0.01070055 -0.10029741 -0.00721227]
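#A hedged cross-check (not part of the original code): solve the normal
#equations with numpy least squares on a design matrix with a 1s column
A = np.column_stack((np.ones(xdat.shape[0]), xdat))
coef_ls = np.linalg.lstsq(A, ydat, rcond=None)[0]
print(coef_ls) #first entry is the intercept; the rest match regr.coef_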
# Change current working directory
os.chdir("/Users/sshen/climstats/")
# Read the data
dtmean = np.array(read_table(
"data/aravg.ann.land_ocean.90S.90N.v5.0.0.201909.txt",
header = None, delimiter = r"\s+"))
xDT = dtmean[0:139, 0]
yDT = dtmean[0:139, 1]
# Make a linear fit, i.e., linear regression
regLin = np.array(np.polyfit(xDT, yDT, 1))
ablinereg = regLin[1] + xDT*regLin[0]
yld,yud,xdd,rld,rud = linregress_CIs(xDT,yDT)
# Plot the figure
fig, ax = plt.subplots(2, 1, figsize=(12,12))
ax[0].plot(xDT, yDT, 'ko-')
ax[0].plot(xDT, ablinereg, 'k-')
ax[0].plot(xdd, yld, 'r-')
ax[0].plot(xdd, yud, 'r-')
ax[0].plot(xdd, rld, 'b-')
ax[0].plot(xdd, rud, 'b-')
ax[1].plot(xDT, yDT - ablinereg, 'kd')
ax[0].set_title(
"Global Annual Mean Land and Ocean Surface \n\
Temperature Anomolies", fontweight = 'bold',
size = 25, pad = 20)
ax[0].set_ylabel("Temperature [$\degree$C]",
size = 25, labelpad = 20)
ax[0].tick_params(length=6, width=2, labelsize=20);
ax[0].set_yticks(np.round(np.linspace(-1, 1, 5), 2))
ax[0].text(1880, 0.4,
"Linear temp trend 0.7348 deg C per century",
color = 'k', size = 20)
ax[0].text(1877, 0.6, "(a)", size = 20)
ax[0].axes.get_xaxis().set_visible(False)
ax[1].tick_params(length=6, width=2, labelsize=20);
ax[1].set_yticks(np.round(np.linspace(-0.5, 0.5, 5), 2))
ax[1].set_ylabel("Residuals [$\degree$C]",
size = 25, labelpad = 20)
ax[1].text(1877, 0.4, "(b)", size = 20)
ax[1].set_xlabel("Year", size = 25, labelpad = 20)
fig.tight_layout(pad=-1.5)
plt.show()
#Kolmogorov-Smirnov (KS) test for normality
import statistics
from scipy.stats import kstest
resi = yDT - ablinereg
testvar = (resi - np.mean(resi))/statistics.stdev(resi)
print(kstest(testvar, 'norm'))
#KstestResult(statistic=0.0751, pvalue=0.3971)
#The normality assumption is not rejected
#perform Durbin-Watson test for independence
from statsmodels.stats.stattools import durbin_watson
print(durbin_watson(resi))
#0.45234915992769864, not in (1.5, 2.5)
#The independence assumption is rejected
#calculate autocorrelations of temp data for edof
import statsmodels.api as sm
autocorr = sm.tsa.acf(yDT)
rho1 = autocorr[1]
edof = (yDT.size - 2)*(1 - rho1)/(1 + rho1)
print(edof) #5.183904 effective degrees of freedom
#Compare the critical t values for different edof
import scipy.stats
print(scipy.stats.t.ppf(q=.975, df=137))
#critical t value 1.9774312122928936
print(scipy.stats.t.ppf(q=.975, df=5))
#critical t value 2.5705818366147395
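#A hedged illustration (not part of the original code): compare the
#trend's t-statistic with both critical values; a rough
#serial-correlation adjustment inflates the standard error
#by sqrt((n-2)/edof)
res = scistats.linregress(xDT, yDT)
print(res.slope/res.stderr) #t-statistic assuming independent residuals
print(res.slope/(res.stderr*np.sqrt((xDT.size - 2)/edof)))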
os.chdir("/Users/sshen/climstats/")
# Read the data
dtmean = np.array(read_table(
"data/aravg.ann.land_ocean.90S.90N.v5.0.0.201909.txt",
header = None, delimiter = "\s+"))
xDT = dtmean[0:139, 0]
yDT = dtmean[0:139, 1]
# Create trend line
reg3 = np.array(np.polyfit(xDT, yDT, 3))
ablineDT3 = reg3[3] + xDT*reg3[2] + \
(xDT**2)*reg3[1] + (xDT**3)*reg3[0]
fig, ax = plt.subplots(2, 1, figsize=(12,12))
ax[0].plot(xDT, yDT, 'ko-')
ax[0].plot(xDT, ablineDT3, 'k-', linewidth = 4)
ax[1].plot(xDT, yDT - ablineDT3, 'kd')
ax[0].set_title("Global Annual Mean Land and Ocean \n\
Surface Temperature Anomolies",
fontweight = 'bold', size = 25, pad = 20)
ax[0].tick_params(length=6, width=2, labelsize=20);
ax[0].set_yticks(np.round(np.linspace(-1, 1, 5), 2))
ax[0].set_ylabel("Temperature [$\degree$C]",
size = 25, labelpad = 20)
ax[0].text(1900, 0.3, "The third-order polynomial fit",
color = 'k', size = 20)
ax[0].text(1877, 0.6, "(a)", size = 20)
ax[0].axes.get_xaxis().set_visible(False)
ax[1].tick_params(length=6, width=2, labelsize=20);
ax[1].set_yticks(np.round(np.linspace(-0.4, 0.4, 5), 2))
ax[1].set_ylabel("Residuals [$\degree$C]",
size = 25, labelpad = 20)
ax[1].set_xlabel("Year", size = 25, labelpad = 20)
ax[1].text(1877, 0.35, "(b)", size = 20)
fig.tight_layout(pad=-1.5)
plt.show()
guess = np.polyfit(xDT, yDT, 9)
# Define the ninth-order polynomial function
def fit_func(p, t):
    return p[0]*t**9 + p[1]*t**8 + p[2]*t**7 + \
        p[3]*t**6 + p[4]*t**5 + p[5]*t**4 + p[6]*t**3 + \
        p[7]*t**2 + p[8]*t + p[9]
# Fit the model
Model = ordire.Model(fit_func)
Data = ordire.RealData(xDT, yDT)
Odr = ordire.ODR(Data, Model, guess, maxit = 10000000)
output = Odr.run()
beta = output.beta
fig, ax = plt.subplots(2, 1, figsize=(12,12))
ax[0].plot(xDT, yDT, 'ko-')
ax[0].plot(xDT, fit_func(beta, xDT), 'k-', linewidth = 4)
ax[1].plot(xDT, yDT - fit_func(beta, xDT), 'kd')
ax[0].set_title("Global Annual Mean Land and Ocean \n \
Surface Temperature Anomolies",
fontweight = 'bold', size = 25, pad = 20)
ax[0].tick_params(length=6, width=2, labelsize=20);
ax[0].set_ylabel("Temperature [deg C]", size = 25, labelpad = 20)
ax[0].text(1900, .4, "Ninth-order Polynomial Fit",
color = 'k', size = 20)
ax[0].text(1877, 0.6, "(a)", size = 20)
ax[0].axes.get_xaxis().set_visible(False)
ax[1].tick_params(length=6, width=2, labelsize=20);
ax[1].set_ylabel("Residuals [deg C]", size = 25, labelpad = 20)
ax[1].set_xlabel("Year", size = 25, labelpad = 20)
ax[1].text(1877, 0.25, "(b)", size = 20)
fig.tight_layout(pad=-1.5)
plt.show()
#Note: np.polyfit may emit a RankWarning here because a ninth-order
#polynomial in raw year values is poorly conditioned
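#A hedged alternative (not part of the original code): numpy's
#Polynomial.fit shifts and scales the x data internally, which
#avoids the RankWarning
from numpy.polynomial import Polynomial
p9 = Polynomial.fit(xDT, yDT, 9)
print(np.max(np.abs(p9(xDT) - fit_func(beta, xDT))))
#the polyfit and ODR curves should be close, though not identical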