Chapter 3: Decision Making and Estimation


The Python code was developed based on the R code written by Samuel Shen, Distinguished Professor, San Diego State University, California, USA.

Kaelia Okamura, Joseph Diaz, Liu Yang, and Briana Ramirez contributed to the Python code development. Kaelia Okamura composed and finalized the code.



In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import matplotlib.dates as mdates
from datetime import datetime
import math
import os
from scipy.stats import norm
import scipy.stats as scistats
from netCDF4 import Dataset as ds
from pandas import read_table, read_csv
from scipy import stats  # scipy.stats is used as "stats" throughout this chapter
from scipy.stats import gamma
from sklearn.linear_model import LinearRegression
from statsmodels.distributions.empirical_distribution import ECDF
In [2]:
# Style dictionary to standardize the plotting scheme between different Python scripts
styledict = {'xtick.labelsize':21,
             'xtick.major.size':9,
             'xtick.major.width':1,
             'ytick.labelsize':21,
             'ytick.major.size':9,
             'ytick.major.width':1,
             'legend.framealpha':0.0,
             'legend.fontsize':15,
             'axes.labelsize':22,
             'axes.titlesize':22,
             'axes.linewidth':2,
             'figure.figsize':(12,8),
             'savefig.format':'jpg'}
plt.rcParams.update(**styledict)
In [3]:
# go to your working directory (edit this path for your own system)
os.chdir("/Users/sshen/climstats")

Figure 3.1: Histograms of data and means

In [4]:
#Python plot Fig. 3.1a: Histogram
mu = 0; sig = 10; m=10000; n = 100; k = m*n
x = np.random.normal(mu, sig, size = k)
plt.hist(x, bins = 201)
plt.title("Histogram of x: sd(x) = 10",
          pad = 15)
plt.xlabel("x", labelpad = 20)
plt.ylabel("Frequency", labelpad = 20)
plt.yticks([0, 10000, 20000], rotation=90)
plt.xticks([-40, -20, 0, 20, 40])
plt.tick_params(axis = 'y', length = 7, labelsize=20)
plt.tick_params(axis='x', which='both',
                bottom=False, top=False)
plt.show()
In [5]:
#Python plot Fig. 3.1b: Histogram of means
x = np.random.normal(mu, sig, size = k)
xmat = x.reshape(m,n)
xbar = np.mean(xmat, axis = 1)
np.size(xbar)
plt.hist(xbar, bins = 31)
plt.title(r'Histogram of $\bar{x}$: sd($\bar{x}$) = 1',
          pad = 15)
plt.xlabel(r"$\bar{x}$", labelpad = 20)
plt.ylabel(r"Frequency", labelpad = 20)
plt.yticks([0, 200, 400, 600, 800], rotation=90)
plt.xticks([-40, -20, 0, 20, 40])
plt.tick_params(axis = 'y', length = 7, labelsize=20)
plt.tick_params(axis='x', which='both',
                bottom=False, top=False)
plt.show()
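The narrower histogram in Fig. 3.1b illustrates the central limit theorem: the standard deviation of the sample means should be close to sig/sqrt(n) = 10/sqrt(100) = 1. A minimal numerical check, reusing sig, n, and xbar from the two cells above:

#theoretical standard error of the mean: sigma/sqrt(n)
print(sig/np.sqrt(n))   # 1.0
#empirical standard deviation of the 10,000 sample means
print(np.std(xbar))     # close to 1.0 (varies with the random draw)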
In [6]:
#Verify 95% CI by numerical simulation: Python code
m = 10000 #10,000 experiments
n = 30 #sample size n
truemean = 8
da = np.random.normal(truemean, 1, size = (m,n))
esmean = da.mean(1)
essd = da.std(1)
upperci = esmean + 1.96*essd/np.sqrt(n)
lowerci = esmean - 1.96*essd/np.sqrt(n)
l=0
for k in np.arange(0, m):
    if upperci[k] >= truemean and lowerci[k] <= truemean :
       l = l + 1
l/m
#[1] 0.9398 #which is approximately 0.95
Out[6]:
0.9404
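The simulated coverage (about 0.94) falls slightly short of the nominal 0.95, partly because 1.96 is the large-sample normal critical value while each interval uses a standard deviation estimated from only n = 30 values. A minimal sketch of the same check with the t critical value, reusing esmean, essd, truemean, and n from the cell above:

from scipy.stats import t
tc = t.ppf(0.975, n - 1)  # about 2.045 for n = 30, slightly larger than 1.96
upper_t = esmean + tc*essd/np.sqrt(n)
lower_t = esmean - tc*essd/np.sqrt(n)
#fraction of the 10,000 t-based intervals that cover the true mean
print(np.mean((lower_t <= truemean) & (truemean <= upper_t)))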

Figure 3.2: Confidence intervals and confidence levels

In [7]:
#Python plot Fig 3.2: Confidence intervals and confidence levels

# Standard normal probability density function
def intg(x):
    return 1/(np.sqrt(2 * np.pi))*np.exp(-x**2/2)
a_, b_ = -3, 3      # full plotting range
a, b = -1.96, 1.96  # approximately 95% probability limits
a1, b1 = -1.0, 1.0  # one-standard-error limits

x = np.linspace(-3, 3)
y = intg(x)
# Set up figure
fig, ax = plt.subplots(figsize=(12,10))
ax.plot(x, y, 'black', linewidth=1)
ax.set_ylim(bottom=0)

# Make the shaded region
ix_ = np.linspace(a_,b_)
ix = np.linspace(a, b)
ix1 = np.linspace(a1, b1)

# Using the intg function
iy_ = intg(ix_)
iy = intg(ix)
iy1 = intg(ix1)

# Create the polygon vertices for each shaded region
verts_ = [(a_, 0), *zip(ix_, iy_), (b_, 0)]
verts = [(a, 0), *zip(ix, iy), (b, 0)]
verts1 = [(a1, 0), *zip(ix1, iy1), (b1, 0)]

# Creating desired polygons
from matplotlib.patches import Polygon
poly_ = Polygon(verts_, facecolor='red', edgecolor='black', 
                linewidth=0.5)
poly =Polygon(verts,facecolor='lightblue',edgecolor='black')
poly1 =Polygon(verts1, facecolor='white', edgecolor='black')

# Adding polygon
ax.add_patch(poly_)
ax.add_patch(poly)
ax.add_patch(poly1)

# Adding appropriate text
ax.text(0.5 * (a + b), 0.24, "Probability = 0.68",
        horizontalalignment='center', verticalalignment="center", 
        fontsize=15, fontweight="bold")
ax.text(-1.55, .02, "SE", fontsize=15)
ax.text(1.40, .02, "SE", fontsize=15)
ax.text(-0.9, .02, "SE", fontsize=15)
ax.text(0.65, .02, "SE", fontsize=15)
ax.text(-0.05, .02, r'$\bar{x}$', fontsize=15)

# Creating desired arrows
plt.arrow(-2.2,.02,-0.5,0.05)
ax.text(-3,0.08,"Probability = \n 0.025", fontsize=15, 
        fontweight="bold")

# Label the figure
plt.ylabel("Probability density")
plt.xlabel("True mean as a random variable")
plt.title("Confidence Intervals and Confidence Levels")

ax.set_xticks((a,a1,0,b1,b))
ax.set_xticklabels(('','','','',''))

plt.show()
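The shaded areas in Fig. 3.2 are standard normal probabilities measured in units of the standard error. A quick numerical check with scipy.stats.norm (imported above):

print(norm.cdf(1) - norm.cdf(-1))        # about 0.68: within one SE of the mean
print(norm.cdf(1.96) - norm.cdf(-1.96))  # about 0.95: within 1.96 SE
print(1 - norm.cdf(1.96))                # about 0.025: each red tail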

Statistics of Edmonton January Surface Air Temperature

In [8]:
#Python code for Edmonton Jan 1880-1929 temp anom statistics
#from the NOAAGlobalTemp data
import pandas as pd
from statistics import stdev
da1 = pd.read_csv("data/Lat52.5_Lon-112.5.csv");
da1.shape
#(1642, 3) #1642 months: Jan 1880 - Oct 2016
da1.iloc[0:1,:]
#	Level Name	Date	Value
#0	NOAA Merged Land Ocean Global Surface Temperat...	1880-01-01	-7.9609
jan = np.arange(0, 1641, 12)
Tjan = np.array(da1.iloc[jan,:])
TJ50 = Tjan[0:50,2]
#print(TJ50)
xbar = np.mean(TJ50)
sdEdm = stdev(TJ50)
EM = 1.96*sdEdm/np.sqrt(50) 
CIupper = xbar + EM
CIlower = xbar - EM
np.round(np.array([xbar, sdEdm, EM, CIlower, CIupper]), 2)
#array([-2.47,  4.95,  1.38, -3.85, -1.1 ])
Out[8]:
array([-2.47,  4.95,  1.37, -3.84, -1.1 ])

Figure 3.3: Edmonton January Surface Air Temperature Anomalies

In [9]:
# Python plot Fig. 3.3: Edmonton January temp
#52.5N, 112.5W, Edmonton, NOAAGlobalTemp Version 3.0
import pandas as pd
from statistics import stdev
from sklearn.linear_model import LinearRegression
da1 = pd.read_csv("data/Lat52.5_Lon-112.5.csv")
jan = np.arange(0, 1641, 12)
Tjan = np.array(da1.iloc[jan,:])
Tda = Tjan[:,2]
t = np.arange(1880,2017)
t1 = pd.DataFrame(t)
Tda1 = pd.DataFrame(Tda)
reg = LinearRegression().fit(t1, Tda1)
predictions = reg.predict(t.reshape(-1, 1))
plt.plot(t1, Tda1, color = 'k')
plt.ylim(-18, 12)
plt.grid()
plt.title('Edmonton Jan Surface Air Temperature Anomalies')
plt.xlabel('Time: 1880-2016')
plt.ylabel('Temperature Anomaly [$\degree$C]')
plt.plot(t, predictions, '-', color="red")
plt.text(1885,10, 'Linear trend: 3.78$\degree$C per Century',
        fontsize = 20, color ='red')
m19972016 = np.mean(Tda[117:137]) #1997--2016 Jan mean
m18801929 = np.mean(Tda[0:50]) #1880--1929 Jan mean
EM = 1.96*stdev(TJ50)/np.sqrt(50) #TJ50 is the 1880-1929 data from the previous cell
plt.plot(t[117:137], np.repeat(m19972016, 20), 
         color = 'blue')
plt.plot(t[0:50], np.repeat(m18801929, 50), 
         color = 'darkgreen')
plt.text(1985, -15, 
         '1997-2016' '\n' 'mean = 1.53$\degree$C',
     color = 'blue', fontsize = 20)
plt.text(1880, -15, 
         '1880-1929' '\n' 'mean = -2.47$\degree$C',
     color = 'blue', fontsize = 20)
plt.show()  
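The red label in Fig. 3.3 quotes a linear trend of 3.78°C per century. A minimal check, reusing the fitted regression object reg from the cell above (the slope is in °C per year):

#convert the regression slope from degrees C per year to degrees C per century
print(np.round(reg.coef_.ravel()[0]*100, 2))  # about 3.78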

Edmonton 1997-2016 hypothesis testing example

In [10]:
# Python code for the Edmonton 1997-2016 hypothesis testing example
from scipy.stats import t
da1 = pd.read_csv("data/Lat52.5_Lon-112.5.csv")
jan = np.arange(0, 1641, 12)
Tjan = np.array(da1.iloc[jan,:])
Tda = Tjan[:,2]
da3 = Tda[117:137] #1997--2016 January data
xbar = np.mean(da3) 
sdEdm = np.std(da3, ddof=1)
np.round(np.array([xbar, sdEdm]), 2)
#1.53 2.94
xbar/(sdEdm/np.sqrt(20))
#2.331112 #t-statistic ts
#calculate p-value
1 - t.cdf(2.331112, 19)
#0.01545624  # p value < the significance level 0.05 
Out[10]:
0.015456235699543908
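The same one-sample test can be cross-checked with scipy.stats.ttest_1samp, which returns a two-sided p-value; with a positive t-statistic, halving it gives the one-sided value computed above. A minimal sketch reusing da3 from the cell above:

from scipy.stats import ttest_1samp
tstat, p_two_sided = ttest_1samp(da3.astype(float), 0.0)
print(np.round(tstat, 4), np.round(p_two_sided/2, 6))
#t is about 2.3311 and the one-sided p-value about 0.0155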

Fig. 3.4: A schematic illustration for null and alternative hypotheses

In [12]:
from scipy.stats import norm
x = np.linspace(-3,6,1000)
fig = plt.figure(figsize = (20,10))
#the black left curve
left_curve = norm.pdf(x, loc = 0, scale = 1) 
plt.plot(x, left_curve, 'k')
right_curve = norm.pdf(x, loc = 2.5, scale = 1)
plt.plot(x, right_curve, 'b--') #'b--' for blue dashed lines
plt.title("Probability Density Functions of \n \
          Null and Alternative Hypotheses for Inference", 
          fontsize = 25)
#Alpha region
alpha_region = np.linspace(2,4,222)
plt.fill_between(alpha_region, left_curve[555:777], 
                 0, color = 'pink' )
plt.text(2.3, 0.05, r"$\alpha$: Significance level", 
         size = 20, color = 'r') #alpha label
plt.annotate('', xy=(2.05,0.005), xytext=(2.4,0.04), 
             arrowprops=dict(arrowstyle='->', color = 'r'))
#Beta region
beta_region = np.linspace(-1,2,333)
plt.fill_between(beta_region, right_curve[222:555],0, 
                 color = 'cyan')
plt.text(1,0.03, r"$\beta$", size = 20, color = 'b')
#Critical Value
plt.axvline(x=2, color = 'r') #vertical line
plt.text(1, -0.05, r"Critical Value $x_c$", size = 20, 
         color = 'r')
plt.plot(2, 0, 'ro', markersize = 10)
#mu 0 line
plt.axvline(x = 0, color = 'k', linestyle = '--') #for mu_0
plt.text(0, -0.05, r"$\mu_0$", size = 20)
#p value
plt.axvline(x = 2.5, color = 'b', linestyle = '--')
plt.text(2.6, 0.03, "p-value", size = 20, color = 'b')
plt.plot(2.5, 0, 'bo', markersize = 10)
plt.annotate('', xy=(2.6,0.005), xytext=(2.8,0.03), 
             arrowprops=dict(arrowstyle='->', color = 'b'))
#comment on the left side
plt.text(-3, 0.45, 'Probability density', size = 20)
plt.text(-2.8, 0.43, 'function of the', size = 20)
plt.text(-2.8, 0.41, '$H_0$ distribution', size = 20)
#comment on the right side
plt.text(4, 0.45, "Probability density" , 
         size = 20, color = 'b')
plt.text(3.6, 0.43, "function of the test statistic", 
         size = 20, color = 'b')
plt.text(4.1, 0.41, "when $H_1$ is true", 
         size = 20, color = 'b')
#Accept H0
plt.annotate('', xy=(2,0.4), xytext=(-3,0.4),
             arrowprops=dict(arrowstyle='<->')) 
#Accept H1
plt.annotate('', xy=(2,0.4), xytext=(6,0.4), 
             arrowprops=dict(arrowstyle='<->', color = 'b')) 
plt.annotate('', xy = (0, 0.45), xytext = (2.5, 0.45), 
    arrowprops = dict(arrowstyle= '<->', color = 'maroon'))                           
plt.text(0.75, 0.46, "Difference", 
         size = 20, color = 'maroon')
plt.text(-1.3, 0.1, r'$1- \alpha $: Confidence level', 
         size = 20)
plt.text(2.5, 0.1, r"$1-\beta$: Power level", 
         size = 20, color = 'g')
plt.text(4, 0.35, "Accept $H_1$", size = 20, color = 'b')
plt.text(0.75, 0.35, "Accept $H_0$", size = 20)
plt.ylim(-0.02,0.5) #limits the y axis to include more space
plt.axis('off') #turns off x and y axis
plt.show()

Fig. 3.5: AR(1) time series and auto-correlation

In [13]:
#establishing subplots
fig, axs = plt.subplots(2,2, figsize = (20,10)) 
fig.tight_layout(pad = 7.0)
#establishing constants
lamb = 0.9; alpha = 0.25; y0 = 4
n = 1000 #length of array
x = np.zeros(n)
w = np.random.normal(size = 1000)
x[0] = y0
for t in range(1,n):
    x[t] = x[t-1]* lamb + alpha*w[t]
#time values from 200-400
t200_400 = np.linspace(201, 400, 200)
axs[0,0].plot(t200_400, x[200:400], color = 'k')
axs[0,0].set_title(r"AR(1) Time Series: $\rho = 0.9$", 
                   size = 25)
axs[0,0].set_xlabel('Time', size = 20); 
axs[0,0].set_ylabel('x', size = 20)
axs[0,0].tick_params(axis='both', which='major', 
                     labelsize=15)
#lagged auto-correlation for lags 1 to M
M = 10
rho = np.zeros(M)
for m in range(1, M + 1):
    rho[m-1] = np.corrcoef(x[0:n-m], x[m:n])[0,1]
axs[1,0].scatter(np.linspace(1,10,10), rho, 
                 color = 'none', edgecolor = 'k')
axs[1,0].set_ylim(0,1.2)
axs[1,0].set_xlabel('Time lag', size = 20); 
axs[1,0].set_ylabel('Auto-correlation', size = 20)
axs[1,0].set_title(r'Lagged auto-correlation : $\rho = 0.9$', 
                   size = 25)
#theoretical auto-correlation of AR(1): lambda**lag
rho_theory = np.zeros(M)
for m in range(1, M + 1):
    rho_theory[m-1] = lamb**m
axs[1,0].scatter(np.linspace(1,10,10), rho_theory, 
                 color = 'b', marker = '+')
axs[1,0].tick_params(axis='both', which='major', 
                     labelsize=15)
# For lambda = 0.6
lamb = 0.6; alpha = 0.25; y0 = 4
n = 1000 #length of array
x = np.zeros(n)
w = np.random.normal(size = 1000)
x[0] = y0
for t in range(1,n):
    x[t] = x[t-1]* lamb + alpha*w[t]
#time values from 200-400
t200_400 = np.linspace(201, 400, 200)
axs[0,1].plot(t200_400, x[200:400], color = 'k')
axs[0,1].set_title(r"AR(1) Time Series: $\rho = 0.6$", 
                   size = 25)
axs[0,1].set_xlabel('Time', size = 20); 
axs[0,1].set_ylabel('x', size = 20)
axs[0,1].tick_params(axis='both', which='major', 
                     labelsize=15)
#lagged auto-correlation for lags 1 to M
M = 10
rho = np.zeros(M)
for m in range(1, M + 1):
    rho[m-1] = np.corrcoef(x[0:n-m], x[m:n])[0,1]
axs[1,1].scatter(np.linspace(1,10,10), rho, 
                 color = 'none', edgecolor = 'k')
axs[1,1].set_ylim(0,1.2)
axs[1,1].set_xlabel('Time lag', size = 20); 
axs[1,1].set_ylabel('Auto-correlation', size = 20)
axs[1,1].set_title(r'Lagged auto-correlation : $\rho = 0.6$', 
                   size = 25)
#theoretical auto-correlation of AR(1): lambda**lag
rho_theory = np.zeros(M)
for m in range(1, M + 1):
    rho_theory[m-1] = lamb**m
axs[1,1].scatter(np.linspace(1,10,10), 
                 rho_theory, color = 'b', marker = '+')
axs[1,1].tick_params(axis='both', which='major', 
                     labelsize=15)
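The lag-1 autocorrelation shown in the lower panels is what drives the effective sample size n(1 - rho)/(1 + rho) used later in this chapter. A minimal sketch for the two lambda values above:

#effective sample size of an AR(1) series of length n = 1000
n = 1000
for lam in (0.9, 0.6):
    print(lam, np.round(n*(1 - lam)/(1 + lam), 1))
#lambda = 0.9 leaves roughly 53 independent samples; lambda = 0.6 leaves about 250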

Figure 3.6: NOAAGlobalTemp V5 1880-2019

In [14]:
import pandas as pd
da1 = pd.read_csv("data/NOAAGlobalTempAnn2019.csv", 
                  header = None)
print(da1.shape) # 140 years of anomalies data
print("")#leave a line of space
print(da1.iloc[0:2, 0:5]) # column 1: year; column 2: data
print("")#leave a line of space
print(da1.iloc[138:142, 0:5])
t = np.array(da1.iloc[:, 0])
Tann = np.array(da1.iloc[:,1])
lm = LinearRegression()# create linear regression
lm.fit(t.reshape(-1,1), Tann)
regAnn = lm.predict(t.reshape(-1,1))
print("")#leave a line of space
print(lm.intercept_, lm.coef_)
# plot time series
plt.plot(t, Tann, 'k', linewidth = 2)
plt.title("NOAA Global Average Annual Mean SAT Anomalies", 
          pad = 10)
plt.xlabel("Time", labelpad = 10)
plt.ylabel(r"Temperature Anomalies [$\degree$C]", 
           labelpad = 10)
plt.yticks([-0.5,0,0.5])
plt.plot(t, regAnn, 'r')# plot regression line
plt.axhline(y=0, color = 'b')# plot y = 0
# plot 1920-1949 mean
plt.axhline(y = np.mean(Tann[40:70]), 
            xmin = 0.3, xmax = 0.5, 
            color = 'limegreen', linewidth = 2)
# plot 1950-1979 mean
plt.axhline(y = np.mean(Tann[70:100]), 
            xmin = 0.5, xmax = 0.69,
            color = 'limegreen', linewidth = 2)
# plot 1980-2009 mean
plt.axhline(y = np.mean(Tann[100:130]), 
            xmin = 0.69, xmax = 0.9,
            color = 'limegreen', linewidth = 2)
# plot text
plt.text(1910,0.5, 
         r"Linear trend 0.74 $\degree$C per century", 
         size = 20, color = 'r')
plt.show()
(140, 6)

      0         1         2         3         4
0  1880 -0.432972  0.009199  0.000856  0.000850
1  1881 -0.399448  0.009231  0.000856  0.000877

        0         1         2         3         4
138  2018  0.511763  0.005803  0.000086  0.000002
139  2019  0.633489  0.005803  0.000086  0.000002

-14.741366111830043 [0.00743534]
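The three green segments in Fig. 3.6 are the period means that the next two cells analyze. A minimal check, reusing Tann from the cell above:

print(np.round(np.mean(Tann[40:70]), 2))    # 1920-1949 mean, about -0.38
print(np.round(np.mean(Tann[70:100]), 2))   # 1950-1979 mean, about -0.29
print(np.round(np.mean(Tann[100:130]), 2))  # 1980-2009 mean, about 0.12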

Python code for the 1920-1949 NOAAGlobalTemp statistics

In [15]:
#Python code for the 1920-1949 NOAAGlobalTemp statistics
import pandas as pd
import scipy.stats
from scipy.stats import pearsonr
da = pd.read_csv("data/NOAAGlobalTempAnn2019.csv", 
                 header = None)
da1 = np.array(da.iloc[:,1])
Ta = da1[40:70]
n = 30
xbar = np.mean(Ta)
s = np.std(Ta, ddof =1)
r1, _ = pearsonr(Ta[0:29], Ta[1:30])
neff = n*(1 - r1)/(1 + r1)
print('rho1=', np.round(r1,2), 'neff=', np.round(neff, 2))
#rho1= 0.78 neff= 4 #3.677746 approximately equal to 4
neff = 4
CI1 = xbar - s/np.sqrt(n); CI2 = xbar + s/np.sqrt(n)
CI3 = xbar - s/np.sqrt(neff); CI4 = xbar + s/np.sqrt(neff)
tc = scipy.stats.t.ppf(q=0.975, df=4)
result = np.array([xbar, s, r1, CI1, CI2, CI3, CI4, tc])
print(np.round(result, 2))
#[-0.38  0.16  0.78 -0.41 -0.35 -0.46 -0.3   2.78]
rho1= 0.78 neff= 3.68
[-0.38  0.16  0.78 -0.41 -0.35 -0.46 -0.3   2.78]
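CI3 and CI4 above span one standard error around the mean using the effective sample size neff. One way (an illustration, not part of the original code) to form a full 95% confidence interval is to scale that standard error by the critical value tc computed in the cell:

#95% CI half-width based on the effective sample size and the t critical value
EM_eff = tc*s/np.sqrt(neff)
print(np.round(np.array([xbar - EM_eff, xbar + EM_eff]), 2))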

Python code for the 1950-1979 and 1980-2009 statistics

In [16]:
#Python code for the 1950-1979 and 1980-2009 statistics
import pandas as pd
import scipy.stats
from scipy.stats import pearsonr
da = pd.read_csv("data/NOAAGlobalTempAnn2019.csv", 
                 header = None)
da1 = np.array(da.iloc[:,1])
Ta = da1[70:100] #1950-1979
n = 30
xbar = np.mean(Ta)
s = np.std(Ta, ddof=1) #sample standard deviation
r1, _ = pearsonr(Ta[0:29], Ta[1:30])
neff = n*(1 - r1)/(1 + r1)
print(neff) #[1] 19.02543 approximately equal to 19
neff = 19
res1 = np.array([xbar, s, r1, neff])
print(np.round(res1,2))
#-0.29  0.11  0.22 19.00

Ta = da1[100:130]#1980-2009
n = 30
xbar = np.mean(Ta)
s = np.std(Ta, ddof=1) #sample standard deviation
r1, _ = pearsonr(Ta[0:29], Ta[1:30])
neff = n*(1 - r1)/(1 + r1)
print(neff) #4.3224 approximately equal to 4
neff = 4
res2 = np.array([xbar, s, r1, neff])
print(np.round(res2,2))
#0.12 0.16 0.75 4.00

#t-statistic for 1950-1979 and 1980-2009 difference
ts = (-0.29 - 0.12)/np.sqrt(0.11**2/19 + 0.16**2/4)
print(ts)  # The result is -4.887592
19.025431450870226
[-0.29  0.11  0.22 19.  ]
4.322417683191189
[0.12 0.16 0.75 4.  ]
-4.887592093220167
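The cell above stops at the t-statistic. Turning it into a decision requires a choice of degrees of freedom, which the code leaves open; the sketch below assumes a conservative df equal to the smaller effective sample size minus one (df = 3):

from scipy.stats import t as tdist
df = min(19, 4) - 1  # assumed conservative degrees of freedom
print(np.round(tdist.ppf(0.975, df), 2))  # two-sided critical value, about 3.18
print(np.round(2*tdist.cdf(ts, df), 4))   # two-sided p-value for ts (about -4.89), well below 0.05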

Python code: Chi-square test for Dodge City, Kansas, USA

In [17]:
from scipy.stats import chi2
((9-15)**2)/15 + ((6-7)**2)/7 + ((16-9)**2)/9
#7.987302 #This is the chi-square statistic
1 - chi2.cdf(7.987302, df=2)  #Compute the tail probability
#[1] 0.01843229 #p-value
1 - chi2.cdf(5.99, df=2) #Compute the tail probability
#0.05003663 # Thus, xc = 5.99
Out[17]:
0.050036627086586294
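The same result can be cross-checked with scipy.stats.chisquare, passing the observed and expected counts from the cell above (both sum to 31):

from scipy.stats import chisquare
stat, pval = chisquare([9, 6, 16], f_exp=[15, 7, 9])
print(stat, pval)  # about 7.987 and 0.0184, matching the hand computation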

Figure 3.7: Histogram and Gamma distribution of data

In [18]:
# read the data file from the folder named "data"
Omaha = pd.read_csv("data/OmahaP.csv")
print(Omaha.shape) #  data dimensions 
#(864, 7)
# change dataset into a matrix
daP = pd.DataFrame(Omaha['PRCP'])
daP = np.array(daP)
shape = (72,12)
daP = np.reshape(daP, shape)
y = daP[:, 5]
# plot the figure
fig, ax = plt.subplots(figsize=(10,8))
plt.hist(y, bins = 11, color = 'white', edgecolor = 'black')
ax.set_xlim(-10,300)
ax.set_ylim(0,20)
ax.set_title('Histogram and its Fitted Gamma Distribution\n\
for Omaha June Precip: 1948-2019', pad = 20)
ax.set_xlabel('Precipitation [mm]', labelpad = 20)
ax.set_ylabel('Frequency', labelpad = 20)
ax.set_yticks(np.linspace(0,20,5))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xticks([0, 50, 100, 150, 200, 250])
# evaluation points for the fitted Gamma(shape = 1.5176, rate = 0.0189) density
xl = np.linspace(y.min(), y.max(), 35)
ax1 = ax.twinx()
# plot gamma distribution
ax1.plot(xl,stats.gamma.pdf(xl,a=1.5176,scale=1/0.0189),
         'o',color = "b")
ax1.spines['right'].set_color('blue')
ax1.spines['top'].set_visible(False)
ax1.tick_params(length=6, width=2,
                labelsize=21, color = 'b', labelcolor = 'b')
ax1.set_yticks([0.000, 0.002, 0.004, 0.006, 0.008, 0.010])
ax1.set_xticks([0, 50, 100, 150, 200, 250])
plt.show()
(864, 7)

Chi-square test for the goodness of fit

Recall the chi-square goodness-of-fit statistic for the 12 histogram bins (11 degrees of freedom):

$$ \chi^{2} = \sum^{12}_{i=1}\frac{(O_{i} - E_{i})^{2}}{E_{i}} = 6.2861 $$

The Python cell below obtains a slightly different value (6.3508), likely because of the rounding of the expected counts.
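A quick way to see that this statistic does not reject the fitted Gamma model is to compute its tail probability with df = 11, the same degrees of freedom used in the cell below:

from scipy.stats import chi2
print(1 - chi2.cdf(6.2861, df=11))  # well above 0.05, so the Gamma fit is not rejected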

Kolmogorov-Smirnov (K-S) test for N(0,1) data

In [19]:
from scipy.stats import kstest
#K-S test of normal data vs N(0,1)
x1 = np.random.normal(0,1,60)#generate 60 normal data
kstest(x1, 'norm')#perform Kolmogorov-Smirnov test
#statistic=0.07706, pvalue=0.86830
#The large p-value implies no significant difference
#K-S test of uniform data vs N(0,1)
x2 = np.random.uniform(0, 1, 60)#generate 60 uniform data
kstest(x2, 'norm')#perform Kolmogorov-Smirnov test
#statistic=.52023, pvalue=1.2440e-15
#The small p-value implies a significant difference
Out[19]:
KstestResult(statistic=0.5012864696053559, pvalue=1.7958128036142968e-14)

Python code: Chi-square test for the goodness of fit: Omaha precipitation

In [20]:
from scipy.stats import gamma, chi2
Omaha = pd.read_csv("data/OmahaP.csv")
daP = pd.DataFrame(Omaha['PRCP'])
daP = np.array(daP)
shape = (72,12)
daP = np.reshape(daP, shape)
y = daP[:, 5] #June precipitation
n = 72 #Total number of observations
m = 12 #12 bins for the histogram in [0, 300] mm
x = np.arange(0, 301, 300/m)
p1 = gamma.cdf(x, a = 1.5176, scale = 1/0.0189)
p1[m] = 1
p2 = p1[1:(m+1)]-p1[0:m]
freq, bins = np.histogram(y, x)
Oi = freq
Ei = np.round(p2*n, 1)
count1 = np.stack((Oi, Ei), axis=0)
print(count1)
np.sum((Oi - Ei)**2/Ei)
#6.350832032876924
1 - chi2.cdf(19.68, df=11) #Compute the tail probability
#0.049927176237663295 #approximately 0.05
1 - chi2.cdf(6.3508, df=11) #=0.84896
[[ 9.  17.  16.   7.   8.   5.   5.   1.   2.   1.   1.   0. ]
 [13.  15.7 12.8  9.5  6.8  4.7  3.2  2.1  1.4  0.9  0.6  1.2]]
Out[20]:
0.8489629259885227

Figure 3.8: K-S test for Edmonton temp

In [21]:
#Read Edmonton data from the gridded NOAAGlobalTemp
da1 = pd.read_csv('data/EdmontonT.csv')
da1 = pd.DataFrame(da1['Value'])
da2 = np.array(da1)# turn dataset into an array
m1 = np.mean(da2)
s1 = np.std(da2)
xa = (da2 - m1)/s1
# convert xa into a 1D array
xa1 = xa.ravel()
x = np.linspace(-5,5,1642)
# ECDF function built from xa
ecdf = ECDF(xa1) 
# percentiles, length of xa is 1642
vals = ecdf(np.linspace(min(x), max(x),1642)) 
fig, ax1 = plt.subplots(figsize=(12,8))
# plot cdf
ax1.plot(x, vals, 'k')
ax1.set_title("Cumulative Distributions: Data vs Model", 
              pad = 20)
ax1.set_xlabel("Temperature Anomalies [$\degree$C]",
               labelpad = 20)
ax1.set_ylabel("$F_{obs}$: Percentile/100", 
               labelpad = 20)
ax2 = ax1.twinx()
# plot ecdf
x = np.linspace(-5,5,101)
ax2.plot(x,scistats.norm.cdf(x), color = 'b')
ax2.set_xlim((-5,5))
ax2.tick_params(length=6, width=2,
                labelsize=20, color = 'b',
                labelcolor = 'b')
ax2.spines['right'].set_color('b')
ax2.text(0.5,0.3, '$D_n = max(F_{obs} - F_{exp})$',
         size = 20, color = 'r')
ax2.text(-0.3,0.2, 'K-S test: Dn = 0.0742, p-value = 0',
         size = 20, color = 'r')

ax2.text(6,0.2, "$F_{exp}$: CDF of N(0,1)", 
         rotation = 'vertical', color = 'b',
         size = 22)
# add arrow
kwargs = {'color' : 'r'}
plt.arrow(-0.4,0.35, 0., -0.07, **kwargs)
# plot horizontal lines
plt.axhline(y=1, ls = '--', color = 'grey')
plt.axhline(y=0, ls = '--', color = 'grey')
plt.show()
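The D_n and p-value quoted inside the figure can be reproduced directly with kstest, reusing the standardized anomalies xa1 from the cell above:

from scipy.stats import kstest
print(kstest(xa1, 'norm'))  # statistic about 0.0742 with a p-value near zero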

K-S test for Omaha June precip vs Gamma

In [22]:
from scipy.stats import gamma
Omaha = pd.read_csv("data/OmahaP.csv")
daP = pd.DataFrame(Omaha['PRCP'])
daP = np.array(daP)
shape = (72,12)
daP = np.reshape(daP, shape)
y = daP[:, 5] #June precipitation
shape, loc, scale = gamma.fit(y, floc=0)
print('shape= ', shape, 'scale = ', scale)
#shape=  1.5178 scale =  52.9263
stats.kstest(y, 'gamma', (shape, loc, scale)) #args must include loc
#statistic=0.066249, pvalue=0.8893
shape=  1.5178084525433841 scale =  52.926272949549464
Out[22]:
KstestResult(statistic=0.066249, pvalue=0.8893)

Figure 3.9: Pearson correlation is sensitive to outliers

In [23]:
import numpy as np
from scipy.stats import pearsonr
# generate 50 random floats in [0,1)
# multiply each float by 0.2
x = np.append(0.2*np.random.uniform(0,1,50), 1)
y = np.append(0.2*np.random.uniform(0,1,50), 1)
plt.plot(x, y, 'o', color = 'black')
plt.show()
#t-statistic
n = 51
r, _  = pearsonr(x, y)
ts = r*(np.sqrt(n-2))/np.sqrt(1-r**2)
print(ts)
#11.1269 #t-statistic (the value varies with the random sample)
tc = scipy.stats.t.ppf(q=.975,df=49)
print(tc)
#2.009575 #This is critical t value
1 - scipy.stats.t.cdf(ts, df=49)
#3.05311e-14 # a very small p-value
2.009575234489209
Out[23]:
1.1102230246251565e-15

Kendall tau test

In [24]:
import numpy as np
from scipy.stats import kendalltau
x = np.append(0.2*np.random.uniform(0,1,50), 1)
y = np.append(0.2*np.random.uniform(0,1,50), 1)
tau, p_value = kendalltau(x, y)
print('tau =', tau, 'pvalue =', p_value)
#tau = -0.09647, 2-sided pvalue = 0.3178
tau = 0.15764705882352945 pvalue = 0.10256063658240785

Mann-Kendall test

In [25]:
pip install pymannkendall 
Requirement already satisfied: pymannkendall in /opt/anaconda3/lib/python3.9/site-packages (1.4.3)
Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.9/site-packages (from pymannkendall) (1.21.5)
Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.9/site-packages (from pymannkendall) (1.9.1)
Note: you may need to restart the kernel to use updated packages.
In [26]:
#pymannkendall is not part of the standard Anaconda distribution;
#install it with pip as shown in the previous cell
import pymannkendall as mk
#Read Edmonton data from the gridded NOAAGlobalTemp
da1 = pd.read_csv('data/EdmontonT.csv')
da1 = pd.DataFrame(da1['Value'])
da2 = np.array(da1)# turn dataset into an array
m1 = np.mean(da2)
s1 = np.std(da2)
xa = (da2 - m1)/s1
mk.original_test(xa)
#tau = 0.153, p = 0.0 #yes, a significant trend exists
#s =  206254 , var_s = 492349056
Out[26]:
Mann_Kendall_Test(trend='increasing', h=True, p=0.0, z=9.295306803981468, Tau=0.1530913460717708, s=206254.0, var_s=492349034.6666667, slope=0.0003958384653880188, intercept=-0.2367061398960399)
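The result above is a named tuple, so individual fields (shown in the output) can be read directly; a minimal sketch:

res = mk.original_test(xa)
print(res.trend, np.round(res.p, 4), np.round(res.Tau, 3))
#e.g. increasing 0.0 0.153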