import numpy as np
import math
import pandas as pd
import statistics as stats
import scipy.stats as scistats
from scipy.stats import norm
import matplotlib.pyplot as plt
import os
from pandas import read_table, read_csv
from netCDF4 import Dataset as ds
import random
# These import statements bring in the python 
# packages we'll need.


# Style Dictionary to standarize plotting 
# scheme between different python scripts 
styledict = {'xtick.labelsize':25,
             'xtick.major.size':9,
             'xtick.major.width':1,
             'ytick.labelsize':25,
             'ytick.major.size':9,
             'ytick.major.width':1,
             'legend.framealpha':0.0,
             'legend.fontsize':15,
             'axes.labelsize':20,
             'axes.titlesize':25,
             'axes.linewidth':2,
             'figure.figsize':(12,8),
             'savefig.format':'jpg'}
plt.rcParams.update(**styledict)

# define font style for text
font = {'family': 'serif',
        'color':  'blue',
        'weight': 'normal',
        'size': 12,
        }


# function to convert a string of numbers into months 
def numeric_To_Named(stri):
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    mon = months[int(stri[5:7])-1]
    day = int(stri[8:])
    return str(day) + " " + mon


#Plot Fig. 2.1: Python code
os.chdir("/Users/sshen/climstats")
precipData = \
 np.array(read_csv("data/NYCDailyDatasummer2019.csv", 
                            usecols=["DATE","PRCP"]))
date = precipData[:, 0]
precip = precipData[:,1]

#Detect wet or dry day
dayNum = 92
dw = np.array([1 if p > .2 else 0 for p in precip])
n0 = np.where(dw[0:dayNum] == 0)
n1 = np.where(dw[0:dayNum] == 1)
nd = n0[0]
nw = n1[0]
#Plot the figure
date = np.arange(0,dayNum)
precip = precipData[0:dayNum,1]
kwargs = {'drawstyle' : 'steps-post'}
plt.plot(date, precip, 'black', linewidth = 2, **kwargs);
plt.plot(nd, dw[nd]-1, 'ro', markersize = 4)
plt.plot(nw,dw[nw]*5, 'bo', markersize = 4)
plt.title("Daily precipitation of New York City: \
  \n 1 Jun 2019 - 31 Aug 2019", fontweight="bold", pad = 20)
plt.ylabel(r"Precipitation [mm/day]", size=25, labelpad=20)
plt.ylim(-5,80)
ax = plt.gca()
ax.text(96,5,"— W",size=20, color = 'b')
ax.text(96,-1.0,"— D",size=20, color = 'b')
ax.text(103, 10, "Dry or Wet Days", size = 25, 
        color = 'b',rotation = 'vertical')
ax.spines['right'].set_color('b')
ax.set_xticks([0, 29, 60, 92])
ax.set_xticklabels(['1 June', '1 July', '1 Aug', '31 Aug'])

[Text(0, 0, '1 June'),
 Text(29, 0, '1 July'),
 Text(60, 0, '1 Aug'),
 Text(92, 0, '31 Aug')]


#Generate random numbers
np.random.normal(0, 1, 4)
#array([-0.96506567,  1.02827408,  0.22863013,  0.44513761])

array([ 1.64873414, -0.16073928,  0.73154401, -0.3343532 ])


#Generate the same random numbers at each run
#Seed remebers the previous random numbers
np.random.seed(8) #Seed number 8 
np.random.normal(0, 1, 4)
#array([ 0.09120472,  1.09128273, -1.94697031, -1.38634953])

array([ 0.09120472,  1.09128273, -1.94697031, -1.38634953])


#Generate normal random numbers with mean=8 and SD =3
np.random.normal(8, 3, 10)

array([ 1.11052528, 15.22950291, 13.18350851, 14.61366885, 10.38448292,
       10.92926329,  4.44971856, 13.74909083,  4.6300196 ,  6.00789359])


#Plot Fig. 2.2: Python code
p = 0.6471
n = np.array(range(1,13))
pdfn = (1 - p)*p ** n
cdfn = 1 - p ** n

# create subplots
fig, ax = plt.subplots(2,1, figsize=(12,12))

ax[0].bar(n, pdfn, color='k', width = 0.5)
ax[0].set_title(
    'Probability Distribution Function of Dry Spell', 
     fontweight = 'bold', size = 20, pad = 20)
ax[0].set_xlabel("Length of Dry Spell: n",          
                 size = 25, labelpad = 10)
ax[0].set_ylabel('Probability', size = 25, labelpad = 20)
ax[0].text(1.5,0.22,'(a)',size=20)

ax[1].plot(n, cdfn, 'k', **kwargs, linewidth = 3)
ax[1].set_title(
    'Cumulative Distribution Function of Dry Spell', 
        fontweight = 'bold', size = 20, pad = 20)
ax[1].set_xlabel('N: Dry Spell not longer than N', 
                 size = 25, labelpad = 10)
ax[1].set_ylabel('Cumulative Probability', 
                 size = 25, labelpad = 20)
ax[1].text(1.5,0.95,'(b)',size=20)

fig.tight_layout(pad=3)


fig, ax = plt.subplots(2,1, figsize=(12,12))

# plot (a) pdf
ax[0].bar(n, pdfn, color='k')

ax[0].set_title("Probability Distribution Function of Dry Spell", 
                size = 20, pad = 20)
ax[0].set_xlabel("Length of Dry Spell: n",
                 size = 20, labelpad = 10)
ax[0].set_ylabel("Probability", 
                 size = 20, labelpad = 20)
ax[0].text(1.5,0.22,"(a)",size=20)
ax[0].tick_params(length=6, width=2, labelsize=20)

# plot (b) cdf
ax[1].plot(n, cdfn, 'k', **kwargs)

ax[1].set_title("Cumulative Distribution Function of Dry Spell",
                size = 20, pad = 20)
ax[1].set_xlabel("N: Dry Spell not longer than N",
                 size = 20, labelpad = 10)
ax[1].set_ylabel("Cumulative Probability", 
                 size = 20, labelpad = 20)
ax[1].text(1,0.95,"(b)",size=20)
ax[1].tick_params(length=6, width=2, labelsize=20)

fig.tight_layout(pad=3)


#Plot Fig. 2.3: Python code
import scipy.stats as stats
n = 20
p = 0.3
k = np.arange(0, n + 1)
pdf = stats.binom.pmf
cdf = stats.binom.cdf
fig, ax1 = plt.subplots(figsize=(12,8))

# plot a bar graph for pdf 
## note: this is a probability plot, NOT a histogram 
ax1.bar(k, pdf(k, n, p), color='k', width = 0.2)
ax1.set_xlabel("Number of successes: k",
               size = 23, labelpad = 15)
ax1.set_ylabel("Probability", 
               size = 25, color = 'k', labelpad = 20)
ax1.set_title("Binomial Distribution: n=20, p = 0.3",
              size = 25, pad = 20)
ax1.text(10, 0.17, "CDF", color = 'b', size = 26)
ax1.tick_params(length=6, width=2, labelsize=21)
ax1.set_xticks([0,5,10,15,20])
ax1.set_yticks([0.00, 0.05, 0.10, 0.15])

ax2 = ax1.twinx()

# plot cdf
ax2.plot(k, cdf(k, n, p), 'bo-');
ax2.set_ylabel("Cumulative Probabilty", 
               size = 25, color='b', labelpad = 15)
ax2.tick_params(length=6, width=2,
                labelsize=21, color = 'b', labelcolor = 'b')
ax2.spines['right'].set_color('b')
ax2.set_ylim(0,)
ax2.text(10, 0.2, "PDF", color = 'k', size = 26)
#plt.savefig("Figure0203.eps")

fig.tight_layout()


#Plot Fig. 2.4: Python code
import scipy.stats as norm
dist = scistats.norm(0,1) # normal distribution
n = 101
b = 5
#Generate n evenly spaced numbers between [-b,b] 
x = np.linspace(-b, b, n)
pdf = dist.pdf(x) # normal probability density function
cdf = dist.cdf(x) # normal cumulative density function

#Plot the figure
fig, ax1 = plt.subplots(figsize=(12,8))

# define the interval to be shaded
interval = np.array([True if i > .85 and i < 2 \
                     else False for i in x])

# plot probability density (f(x))
ax1.plot(x, pdf, 'k')

# fill red section
ax1.fill_between(x, np.zeros(x.size), 
                 pdf, where= interval, color ='red')
ax1.set_xlabel("$x$", size = 25, labelpad = 20)
ax1.set_ylabel("Probability Density f(x)", 
               size = 22, labelpad = 20)
ax1.tick_params(length=6, width=2, labelsize=20)
ax1.set_yticks([0,0.1,0.2,0.3,0.4]);
ax1.text(1.3,.2, "$f(x)$", fontsize = 25)
ax1.text(1.7,.15, "$P_{ab} = \int_a^b f(x)\ dx$", 
         fontsize = 25, color ='red')
ax1.text(.6,.01, "$a$", fontsize = 20, color = 'red')
ax1.text(2,.01, "$b$", fontsize = 20, color ='red')
ax1.set_ylim(0,)

ax2 = ax1.twinx()

# plot cumulative probability (F(x))
ax2.plot(x, cdf, 'b--')
ax2.set_ylabel("Cumulative Probabilty $F(x)$",
               size = 22, color ='b', labelpad = 20)
ax2.tick_params(length=6, width=2, labelsize=20,
                color = 'b', labelcolor = 'b')
ax2.spines['right'].set_color('b');
ax2.text(1.7,.9, "$F(x)$", fontsize=25, color = 'blue')
ax2.set_ylim(0,)

plt.show()


#Plot Fig. 2.5: Histogram and PDF
# generate 200 random numbers from N(5, 3^2)
from scipy.stats import norm
x = np.random.normal(5, 3, 200)

fig, ax = plt.subplots(figsize=(12,8))

# plot histogram
counts, bins, bars = ax.hist(x, bins=17,
  range = (-5,15), color = 'gray', edgecolor = 'k');

# fit normal curve
xfit = np.linspace(x.min(), x.max(), 40)
yfit = norm.pdf(xfit, np.mean(xfit), np.std(xfit))
yfit = yfit * len(x) * ((bins[2] - bins[1]))


# plot normal curve
ax.plot(xfit, yfit, 
        color = 'k', linewidth = 2);

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.tick_params(length=6, width=2, labelsize=20)
#ax.set_yticks([0, .05, 0.10, 0.15]);
ax.set_xticks(np.round(np.linspace(-5, 15, 5), 2))
ax.set_title("Histogram and Its Normal Curve Fit",
              size = 22, pad = 20);
ax.set_xlabel("Random Numbers $x$ of $N(5,3^2)$",
               size = 22, labelpad = 10);
ax.set_ylabel("Frequency/Density",
               size = 22, labelpad = 20);

# generate 200 zeros
y = np.zeros(200)

# plot random points
s = np.repeat(100,200)
plt.scatter(x, y-0.5, marker ='|', 
         lw = 2, color = 'blue')

# note that since the data is random, your histogram
# might differ from the one produced here
plt.show()


#Plot Fig. 2.6: Python code
fig, ax = plt.subplots(2,2, 
             gridspec_kw={'width_ratios': [4, 1],
                         'height_ratios': [1,4]},
                       figsize=(12,8))

# generate 1,000 samples from normal distribution 
x = np.random.normal(5, 3, 1000)

# generate 1,000 samples from uniform distribution
y = np.random.uniform(0, 5, 1000)

# plot normal histogram
ax[0,0].hist(x, edgecolor = 'k',color = 'grey', 
             bins =25, density=True)

# plot uniform histogram
ax[1,1].hist(y, edgecolor = 'k',color = 'grey', 
             bins =25, density=True, 
             orientation=u'horizontal', histtype='bar')

# plot points
ax[1,0].plot(x,y, 'ko', linewidth = 2);
ax[1,0].set_xlabel("$x$", size = 25, labelpad = 20);
ax[1,0].set_ylabel("$y$", size = 25, labelpad = 15);
ax[1,0].tick_params(length=6, width=2, labelsize=20);
ax[1,0].set_xticks(np.linspace(-5,15,5))

ax[0,0].axes.get_xaxis().set_visible(False)
ax[0,0].axes.get_yaxis().set_visible(False)
ax[0,0].spines['right'].set_visible(False)
ax[0,0].spines['top'].set_visible(False)
ax[0,0].spines['left'].set_visible(False)

ax[1,1].axes.get_xaxis().set_visible(False)
ax[1,1].axes.get_yaxis().set_visible(False)
ax[1,1].spines['right'].set_visible(False)
ax[1,1].spines['top'].set_visible(False)
ax[1,1].spines['bottom'].set_visible(False)

ax[0,1].axes.get_xaxis().set_visible(False)
ax[0,1].axes.get_yaxis().set_visible(False)
ax[0,1].spines['right'].set_visible(False)
ax[0,1].spines['top'].set_visible(False)
ax[0,1].spines['left'].set_visible(False)

# note that since the data is random, your figure
# might differ from the one produced here
fig.tight_layout(pad=-1)


#Plot Fig. 2.6: Python code
fig, ax = plt.subplots(2,2, 
             gridspec_kw={'width_ratios': [4, 1],
                         'height_ratios': [1,4]},
                       figsize=(12,8))

# generate 1,000 samples from normal distribution 
x = np.random.normal(5, 3, 1000)

# generate 1,000 samples from uniform distribution
y = np.random.uniform(0, 5, 1000)

# plot normal histogram
ax[0,0].hist(x, edgecolor = 'k',color = 'grey', 
             bins =25, density=True)

# plot uniform histogram
ax[1,1].hist(y, edgecolor = 'k',color = 'grey', 
             bins =25, density=True, 
             orientation=u'horizontal', histtype='bar')

# plot points
ax[1,0].plot(x,y, 'ko', linewidth = 2);
ax[1,0].set_xlabel("$x$", size = 25, labelpad = 20);
ax[1,0].set_ylabel("$y$", size = 25, labelpad = 15);
ax[1,0].tick_params(length=6, width=2, labelsize=20);
ax[1,0].set_xticks(np.linspace(-5,15,5))
ax[0,0].axes.get_xaxis().set_visible(False)
ax[0,0].axes.get_yaxis().set_visible(False)
ax[0,0].spines['right'].set_visible(False)
ax[0,0].spines['top'].set_visible(False)
ax[0,0].spines['left'].set_visible(False)

ax[1,1].axes.get_xaxis().set_visible(False)
ax[1,1].axes.get_yaxis().set_visible(False)
ax[1,1].spines['right'].set_visible(False)
ax[1,1].spines['top'].set_visible(False)
ax[1,1].spines['bottom'].set_visible(False)

ax[0,1].axes.get_xaxis().set_visible(False)
ax[0,1].axes.get_yaxis().set_visible(False)
ax[0,1].spines['right'].set_visible(False)
ax[0,1].spines['top'].set_visible(False)
ax[0,1].spines['left'].set_visible(False)

# note that since the data is random, your figure
# might differ from the one produced here

fig.tight_layout(pad=-1)


#Plot Fig. 2.7: Python code
lam = 5 #mean rate for Poisson distribution
n = 20 #number of occurrence
x = np.arange(n+1)# integers from [0,n]
pdf = np.array(
 [(np.exp(-lam)*lam**k)/np.math.factorial(k) for k in x])
cdf = np.array([np.sum(pdf[:k]) for k in x])

fig, ax1 = plt.subplots(figsize=(12,8))
ax1.plot(x, pdf,'ko', ms = 10) #plot pdf
ax1.set_title('Poisson Distribution: Mean Rate= %1.1f'% lam,
              size = 22, pad = 20)
ax1.set_xlabel("k", size = 25, labelpad = 20)
ax1.set_ylabel("Probability f(k)", 
               size = 22, color = 'k', labelpad = 20)
ax1.tick_params(length=6, width=2, labelsize=21)
ax1.set_xticks([0,5,10,15,20])
ax1.set_yticks([0.00,0.05, 0.10, 0.15])

ax2 = ax1.twinx()
ax2.plot(x, cdf, 'bo', ms = 10) #plot pdf
ax2.set_ylabel("Cumulative Probabilty $F(k)$",
               size = 22, color='b', labelpad = 20)
ax2.tick_params(length=6, width=2,
                labelsize=21, labelcolor = 'b')
ax2.spines['right'].set_color('b')
ax2.set_ylim(-0.02,1.1)

fig.tight_layout()

cdf

array([0.        , 0.00673795, 0.04042768, 0.12465202, 0.26502592,
       0.44049329, 0.61596065, 0.76218346, 0.86662833, 0.93190637,
       0.96817194, 0.98630473, 0.99454691, 0.99798115, 0.99930201,
       0.99977375, 0.99993099, 0.99998013, 0.99999458, 0.9999986 ,
       0.99999965])


#Plot Fig. 2.8  
x = np.linspace(0, 20, 201)
pInv = 1/np.pi # inverse pi
# Normal density function
ygauss = np.array(
    [pInv*np.exp(-((i - 10)**2)*pInv) for i in x])
# Cauchy density function
ycauchy = np.array(
    [1/(np.pi*(1+(i - 10)**2)) for i in x])
#Plot
fig, ax = plt.subplots(figsize=(12,8))

# plot Cauchy distribution
ax.plot(x, ycauchy, 'k-', 
        label="Cauchy Distribution", linewidth=3)
# plot Gaussian distribution
ax.plot(x, ygauss, 'k--', 
  label="Normal (Gaussian) Distribution", linewidth=3)
ax.set_xlabel("$x$", size = 22, labelpad = 20)
ax.set_ylabel("Density", size = 22, labelpad = 20)
ax.tick_params(length=6, width=2, labelsize=21);
ax.set_xticks([0,5,10,15,20])
ax.set_yticks([0.00,0.10, 0.20, 0.30])
ax.set_title("Comparison between Cauchy and \
        \n Gaussian Distributions",
             size = 23, pad = 20);
ax.legend(loc = 'upper left', prop={'size':16})
fig.tight_layout()


#Plot Fig. 2.9: Python code
os.chdir("/Users/sshen/climstats")
da1 = pd.read_csv('data/EdmontonT.csv')
x = da1.iloc[:,2]# all rows from 2nd column of data
m1 = np.mean(x)
s1 = np.std(x)
xa = (x - m1)/s1
y = np.array(xa[0:1632]).reshape((272, 6))#vector=>matrix 
w = np.sum(y**2, axis = 1)# calculate the row sums
x = np.linspace(0,40,200)
cpdf = scistats.chi2.pdf(x, 6) # chi-square PDF
fig, ax = plt.subplots(figsize=(12,8))

# plot histogram 
ax.hist(w, bins = 41, edgecolor = 'k',
        color = 'white', linewidth=3)
ax.set_title("Histogram and its Chi-square Fit \n\
for Emonton temperature data")
ax.set_xlabel("Chi-square data [$\degree$C]$^2$",
              size = 22)
ax.set_ylabel("Frequency", size = 22, labelpad = 20)
ax.tick_params(length=6, width=2, labelsize=21)
ax.set_xticks([0,10,20,30,40])
ax.set_yticks([0,10,20,30,40,50])
ax.spines['top'].set_visible(False)
ax.text(10, 35, "Chi-squre distribution: df=6",
        size = 20, fontdict = font)

ax1 = ax.twinx()
# plot chi-square fit
ax1.plot(x,cpdf, color = 'b', linewidth = 3)
ax1.set_ylabel("Chi-square Density", size = 22, 
               color='b', labelpad = 20)
ax1.set_ylim(0,)
ax1.tick_params(length=6, width=2, color = 'b',
                labelcolor = 'b', labelsize=21)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_color('b')
ax1.set_yticks([0.00,0.05, 0.10, 0.15])

plt.show()

Chapter 2: Elementary Probability and Statistics Review¶

Figure 2.1: Daily precipitation with dry and wet days¶

Figure 2.2: Probability and cumulative distribution functions of dry spell¶

Figure 2.3: PDF and CDF of binomial distribution¶

Figure 2.4: PDF and CDF of standard normal distribution¶

Figure 2.5: Histogram and its PDF fit of normal distribution¶

Figure 2.6: Simulated joint and marginal distributions¶

Figure 2.7: PDF and CDF of Poisson distribution¶

Figure 2.8: Compare Cauchy and normal distributions¶

Figure 2.9: Histogram and chi-square fit of data¶