Chapter 2: Elementary Probability and Statistics Review¶



In [1]:
import numpy as np
import math
import pandas as pd
import statistics as stats
import scipy.stats as scistats
from scipy.stats import norm
import matplotlib.pyplot as plt
import os
from pandas import read_table, read_csv
from netCDF4 import Dataset as ds
import random
# These import statements bring in the python 
# packages we'll need. 
In [2]:
# Style Dictionary to standarize plotting 
# scheme between different python scripts 
styledict = {'xtick.labelsize':25,
             'xtick.major.size':9,
             'xtick.major.width':1,
             'ytick.labelsize':25,
             'ytick.major.size':9,
             'ytick.major.width':1,
             'legend.framealpha':0.0,
             'legend.fontsize':15,
             'axes.labelsize':20,
             'axes.titlesize':25,
             'axes.linewidth':2,
             'figure.figsize':(12,8),
             'savefig.format':'jpg'}
plt.rcParams.update(**styledict)

# define font style for text
font = {'family': 'serif',
        'color':  'blue',
        'weight': 'normal',
        'size': 12,
        }

Figure 2.1: Daily precipitation with dry and wet days¶

In [3]:
# function to convert a string of numbers into months 
def numeric_To_Named(stri):
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    mon = months[int(stri[5:7])-1]
    day = int(stri[8:])
    return str(day) + " " + mon
In [4]:
#Plot Fig. 2.1: Python code
os.chdir("/Users/sshen/climstats")
precipData = \
 np.array(read_csv("data/NYCDailyDatasummer2019.csv", 
                            usecols=["DATE","PRCP"]))
date = precipData[:, 0]
precip = precipData[:,1]

#Detect wet or dry day
dayNum = 92
dw = np.array([1 if p > .2 else 0 for p in precip])
n0 = np.where(dw[0:dayNum] == 0)
n1 = np.where(dw[0:dayNum] == 1)
nd = n0[0]
nw = n1[0]
#Plot the figure
date = np.arange(0,dayNum)
precip = precipData[0:dayNum,1]
kwargs = {'drawstyle' : 'steps-post'}
plt.plot(date, precip, 'black', linewidth = 2, **kwargs);
plt.plot(nd, dw[nd]-1, 'ro', markersize = 4)
plt.plot(nw,dw[nw]*5, 'bo', markersize = 4)
plt.title("Daily precipitation of New York City: \
  \n 1 Jun 2019 - 31 Aug 2019", fontweight="bold", pad = 20)
plt.ylabel(r"Precipitation [mm/day]", size=25, labelpad=20)
plt.ylim(-5,80)
ax = plt.gca()
ax.text(96,5,"— W",size=20, color = 'b')
ax.text(96,-1.0,"— D",size=20, color = 'b')
ax.text(103, 10, "Dry or Wet Days", size = 25, 
        color = 'b',rotation = 'vertical')
ax.spines['right'].set_color('b')
ax.set_xticks([0, 29, 60, 92])
ax.set_xticklabels(['1 June', '1 July', '1 Aug', '31 Aug'])
Out[4]:
[Text(0, 0, '1 June'),
 Text(29, 0, '1 July'),
 Text(60, 0, '1 Aug'),
 Text(92, 0, '31 Aug')]
In [5]:
#Generate random numbers
np.random.normal(0, 1, 4)
#array([-0.96506567,  1.02827408,  0.22863013,  0.44513761])
Out[5]:
array([ 1.64873414, -0.16073928,  0.73154401, -0.3343532 ])
In [6]:
#Generate the same random numbers at each run
#Seed remebers the previous random numbers
np.random.seed(8) #Seed number 8 
np.random.normal(0, 1, 4)
#array([ 0.09120472,  1.09128273, -1.94697031, -1.38634953])
Out[6]:
array([ 0.09120472,  1.09128273, -1.94697031, -1.38634953])
In [7]:
#Generate normal random numbers with mean=8 and SD =3
np.random.normal(8, 3, 10)
Out[7]:
array([ 1.11052528, 15.22950291, 13.18350851, 14.61366885, 10.38448292,
       10.92926329,  4.44971856, 13.74909083,  4.6300196 ,  6.00789359])

Figure 2.2: Probability and cumulative distribution functions of dry spell¶

In [8]:
#Plot Fig. 2.2: Python code
p = 0.6471
n = np.array(range(1,13))
pdfn = (1 - p)*p ** n
cdfn = 1 - p ** n

# create subplots
fig, ax = plt.subplots(2,1, figsize=(12,12))

ax[0].bar(n, pdfn, color='k', width = 0.5)
ax[0].set_title(
    'Probability Distribution Function of Dry Spell', 
     fontweight = 'bold', size = 20, pad = 20)
ax[0].set_xlabel("Length of Dry Spell: n",          
                 size = 25, labelpad = 10)
ax[0].set_ylabel('Probability', size = 25, labelpad = 20)
ax[0].text(1.5,0.22,'(a)',size=20)

ax[1].plot(n, cdfn, 'k', **kwargs, linewidth = 3)
ax[1].set_title(
    'Cumulative Distribution Function of Dry Spell', 
        fontweight = 'bold', size = 20, pad = 20)
ax[1].set_xlabel('N: Dry Spell not longer than N', 
                 size = 25, labelpad = 10)
ax[1].set_ylabel('Cumulative Probability', 
                 size = 25, labelpad = 20)
ax[1].text(1.5,0.95,'(b)',size=20)

fig.tight_layout(pad=3)
In [9]:
fig, ax = plt.subplots(2,1, figsize=(12,12))

# plot (a) pdf
ax[0].bar(n, pdfn, color='k')

ax[0].set_title("Probability Distribution Function of Dry Spell", 
                size = 20, pad = 20)
ax[0].set_xlabel("Length of Dry Spell: n",
                 size = 20, labelpad = 10)
ax[0].set_ylabel("Probability", 
                 size = 20, labelpad = 20)
ax[0].text(1.5,0.22,"(a)",size=20)
ax[0].tick_params(length=6, width=2, labelsize=20)

# plot (b) cdf
ax[1].plot(n, cdfn, 'k', **kwargs)

ax[1].set_title("Cumulative Distribution Function of Dry Spell",
                size = 20, pad = 20)
ax[1].set_xlabel("N: Dry Spell not longer than N",
                 size = 20, labelpad = 10)
ax[1].set_ylabel("Cumulative Probability", 
                 size = 20, labelpad = 20)
ax[1].text(1,0.95,"(b)",size=20)
ax[1].tick_params(length=6, width=2, labelsize=20)

fig.tight_layout(pad=3)

Figure 2.3: PDF and CDF of binomial distribution¶

In [10]:
#Plot Fig. 2.3: Python code
import scipy.stats as stats
n = 20
p = 0.3
k = np.arange(0, n + 1)
pdf = stats.binom.pmf
cdf = stats.binom.cdf
fig, ax1 = plt.subplots(figsize=(12,8))

# plot a bar graph for pdf 
## note: this is a probability plot, NOT a histogram 
ax1.bar(k, pdf(k, n, p), color='k', width = 0.2)
ax1.set_xlabel("Number of successes: k",
               size = 23, labelpad = 15)
ax1.set_ylabel("Probability", 
               size = 25, color = 'k', labelpad = 20)
ax1.set_title("Binomial Distribution: n=20, p = 0.3",
              size = 25, pad = 20)
ax1.text(10, 0.17, "CDF", color = 'b', size = 26)
ax1.tick_params(length=6, width=2, labelsize=21)
ax1.set_xticks([0,5,10,15,20])
ax1.set_yticks([0.00, 0.05, 0.10, 0.15])

ax2 = ax1.twinx()

# plot cdf
ax2.plot(k, cdf(k, n, p), 'bo-');
ax2.set_ylabel("Cumulative Probabilty", 
               size = 25, color='b', labelpad = 15)
ax2.tick_params(length=6, width=2,
                labelsize=21, color = 'b', labelcolor = 'b')
ax2.spines['right'].set_color('b')
ax2.set_ylim(0,)
ax2.text(10, 0.2, "PDF", color = 'k', size = 26)
#plt.savefig("Figure0203.eps")

fig.tight_layout()

Figure 2.4: PDF and CDF of standard normal distribution¶

In [11]:
#Plot Fig. 2.4: Python code
import scipy.stats as norm
dist = scistats.norm(0,1) # normal distribution
n = 101
b = 5
#Generate n evenly spaced numbers between [-b,b] 
x = np.linspace(-b, b, n)
pdf = dist.pdf(x) # normal probability density function
cdf = dist.cdf(x) # normal cumulative density function

#Plot the figure
fig, ax1 = plt.subplots(figsize=(12,8))

# define the interval to be shaded
interval = np.array([True if i > .85 and i < 2 \
                     else False for i in x])

# plot probability density (f(x))
ax1.plot(x, pdf, 'k')

# fill red section
ax1.fill_between(x, np.zeros(x.size), 
                 pdf, where= interval, color ='red')
ax1.set_xlabel("$x$", size = 25, labelpad = 20)
ax1.set_ylabel("Probability Density f(x)", 
               size = 22, labelpad = 20)
ax1.tick_params(length=6, width=2, labelsize=20)
ax1.set_yticks([0,0.1,0.2,0.3,0.4]);
ax1.text(1.3,.2, "$f(x)$", fontsize = 25)
ax1.text(1.7,.15, "$P_{ab} = \int_a^b f(x)\ dx$", 
         fontsize = 25, color ='red')
ax1.text(.6,.01, "$a$", fontsize = 20, color = 'red')
ax1.text(2,.01, "$b$", fontsize = 20, color ='red')
ax1.set_ylim(0,)

ax2 = ax1.twinx()

# plot cumulative probability (F(x))
ax2.plot(x, cdf, 'b--')
ax2.set_ylabel("Cumulative Probabilty $F(x)$",
               size = 22, color ='b', labelpad = 20)
ax2.tick_params(length=6, width=2, labelsize=20,
                color = 'b', labelcolor = 'b')
ax2.spines['right'].set_color('b');
ax2.text(1.7,.9, "$F(x)$", fontsize=25, color = 'blue')
ax2.set_ylim(0,)

plt.show()

Figure 2.5: Histogram and its PDF fit of normal distribution¶

In [12]:
#Plot Fig. 2.5: Histogram and PDF
# generate 200 random numbers from N(5, 3^2)
from scipy.stats import norm
x = np.random.normal(5, 3, 200)

fig, ax = plt.subplots(figsize=(12,8))

# plot histogram
counts, bins, bars = ax.hist(x, bins=17,
  range = (-5,15), color = 'gray', edgecolor = 'k');

# fit normal curve
xfit = np.linspace(x.min(), x.max(), 40)
yfit = norm.pdf(xfit, np.mean(xfit), np.std(xfit))
yfit = yfit * len(x) * ((bins[2] - bins[1]))


# plot normal curve
ax.plot(xfit, yfit, 
        color = 'k', linewidth = 2);

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.tick_params(length=6, width=2, labelsize=20)
#ax.set_yticks([0, .05, 0.10, 0.15]);
ax.set_xticks(np.round(np.linspace(-5, 15, 5), 2))
ax.set_title("Histogram and Its Normal Curve Fit",
              size = 22, pad = 20);
ax.set_xlabel("Random Numbers $x$ of $N(5,3^2)$",
               size = 22, labelpad = 10);
ax.set_ylabel("Frequency/Density",
               size = 22, labelpad = 20);

# generate 200 zeros
y = np.zeros(200)

# plot random points
s = np.repeat(100,200)
plt.scatter(x, y-0.5, marker ='|', 
         lw = 2, color = 'blue')

# note that since the data is random, your histogram
# might differ from the one produced here
plt.show() 

Figure 2.6: Simulated joint and marginal distributions¶

In [13]:
#Plot Fig. 2.6: Python code
fig, ax = plt.subplots(2,2, 
             gridspec_kw={'width_ratios': [4, 1],
                         'height_ratios': [1,4]},
                       figsize=(12,8))

# generate 1,000 samples from normal distribution 
x = np.random.normal(5, 3, 1000)

# generate 1,000 samples from uniform distribution
y = np.random.uniform(0, 5, 1000)

# plot normal histogram
ax[0,0].hist(x, edgecolor = 'k',color = 'grey', 
             bins =25, density=True)

# plot uniform histogram
ax[1,1].hist(y, edgecolor = 'k',color = 'grey', 
             bins =25, density=True, 
             orientation=u'horizontal', histtype='bar')

# plot points
ax[1,0].plot(x,y, 'ko', linewidth = 2);
ax[1,0].set_xlabel("$x$", size = 25, labelpad = 20);
ax[1,0].set_ylabel("$y$", size = 25, labelpad = 15);
ax[1,0].tick_params(length=6, width=2, labelsize=20);
ax[1,0].set_xticks(np.linspace(-5,15,5))

ax[0,0].axes.get_xaxis().set_visible(False)
ax[0,0].axes.get_yaxis().set_visible(False)
ax[0,0].spines['right'].set_visible(False)
ax[0,0].spines['top'].set_visible(False)
ax[0,0].spines['left'].set_visible(False)

ax[1,1].axes.get_xaxis().set_visible(False)
ax[1,1].axes.get_yaxis().set_visible(False)
ax[1,1].spines['right'].set_visible(False)
ax[1,1].spines['top'].set_visible(False)
ax[1,1].spines['bottom'].set_visible(False)

ax[0,1].axes.get_xaxis().set_visible(False)
ax[0,1].axes.get_yaxis().set_visible(False)
ax[0,1].spines['right'].set_visible(False)
ax[0,1].spines['top'].set_visible(False)
ax[0,1].spines['left'].set_visible(False)

# note that since the data is random, your figure
# might differ from the one produced here
fig.tight_layout(pad=-1)
In [14]:
#Plot Fig. 2.6: Python code
fig, ax = plt.subplots(2,2, 
             gridspec_kw={'width_ratios': [4, 1],
                         'height_ratios': [1,4]},
                       figsize=(12,8))

# generate 1,000 samples from normal distribution 
x = np.random.normal(5, 3, 1000)

# generate 1,000 samples from uniform distribution
y = np.random.uniform(0, 5, 1000)

# plot normal histogram
ax[0,0].hist(x, edgecolor = 'k',color = 'grey', 
             bins =25, density=True)

# plot uniform histogram
ax[1,1].hist(y, edgecolor = 'k',color = 'grey', 
             bins =25, density=True, 
             orientation=u'horizontal', histtype='bar')

# plot points
ax[1,0].plot(x,y, 'ko', linewidth = 2);
ax[1,0].set_xlabel("$x$", size = 25, labelpad = 20);
ax[1,0].set_ylabel("$y$", size = 25, labelpad = 15);
ax[1,0].tick_params(length=6, width=2, labelsize=20);
ax[1,0].set_xticks(np.linspace(-5,15,5))
ax[0,0].axes.get_xaxis().set_visible(False)
ax[0,0].axes.get_yaxis().set_visible(False)
ax[0,0].spines['right'].set_visible(False)
ax[0,0].spines['top'].set_visible(False)
ax[0,0].spines['left'].set_visible(False)

ax[1,1].axes.get_xaxis().set_visible(False)
ax[1,1].axes.get_yaxis().set_visible(False)
ax[1,1].spines['right'].set_visible(False)
ax[1,1].spines['top'].set_visible(False)
ax[1,1].spines['bottom'].set_visible(False)

ax[0,1].axes.get_xaxis().set_visible(False)
ax[0,1].axes.get_yaxis().set_visible(False)
ax[0,1].spines['right'].set_visible(False)
ax[0,1].spines['top'].set_visible(False)
ax[0,1].spines['left'].set_visible(False)

# note that since the data is random, your figure
# might differ from the one produced here

fig.tight_layout(pad=-1)

Figure 2.7: PDF and CDF of Poisson distribution¶

In [15]:
#Plot Fig. 2.7: Python code
lam = 5 #mean rate for Poisson distribution
n = 20 #number of occurrence
x = np.arange(n+1)# integers from [0,n]
pdf = np.array(
 [(np.exp(-lam)*lam**k)/np.math.factorial(k) for k in x])
cdf = np.array([np.sum(pdf[:k]) for k in x])

fig, ax1 = plt.subplots(figsize=(12,8))
ax1.plot(x, pdf,'ko', ms = 10) #plot pdf
ax1.set_title('Poisson Distribution: Mean Rate= %1.1f'% lam,
              size = 22, pad = 20)
ax1.set_xlabel("k", size = 25, labelpad = 20)
ax1.set_ylabel("Probability f(k)", 
               size = 22, color = 'k', labelpad = 20)
ax1.tick_params(length=6, width=2, labelsize=21)
ax1.set_xticks([0,5,10,15,20])
ax1.set_yticks([0.00,0.05, 0.10, 0.15])

ax2 = ax1.twinx()
ax2.plot(x, cdf, 'bo', ms = 10) #plot pdf
ax2.set_ylabel("Cumulative Probabilty $F(k)$",
               size = 22, color='b', labelpad = 20)
ax2.tick_params(length=6, width=2,
                labelsize=21, labelcolor = 'b')
ax2.spines['right'].set_color('b')
ax2.set_ylim(-0.02,1.1)

fig.tight_layout()
In [16]:
cdf
Out[16]:
array([0.        , 0.00673795, 0.04042768, 0.12465202, 0.26502592,
       0.44049329, 0.61596065, 0.76218346, 0.86662833, 0.93190637,
       0.96817194, 0.98630473, 0.99454691, 0.99798115, 0.99930201,
       0.99977375, 0.99993099, 0.99998013, 0.99999458, 0.9999986 ,
       0.99999965])

Figure 2.8: Compare Cauchy and normal distributions¶

In [17]:
#Plot Fig. 2.8  
x = np.linspace(0, 20, 201)
pInv = 1/np.pi # inverse pi
# Normal density function
ygauss = np.array(
    [pInv*np.exp(-((i - 10)**2)*pInv) for i in x])
# Cauchy density function
ycauchy = np.array(
    [1/(np.pi*(1+(i - 10)**2)) for i in x])
#Plot
fig, ax = plt.subplots(figsize=(12,8))

# plot Cauchy distribution
ax.plot(x, ycauchy, 'k-', 
        label="Cauchy Distribution", linewidth=3)
# plot Gaussian distribution
ax.plot(x, ygauss, 'k--', 
  label="Normal (Gaussian) Distribution", linewidth=3)
ax.set_xlabel("$x$", size = 22, labelpad = 20)
ax.set_ylabel("Density", size = 22, labelpad = 20)
ax.tick_params(length=6, width=2, labelsize=21);
ax.set_xticks([0,5,10,15,20])
ax.set_yticks([0.00,0.10, 0.20, 0.30])
ax.set_title("Comparison between Cauchy and \
        \n Gaussian Distributions",
             size = 23, pad = 20);
ax.legend(loc = 'upper left', prop={'size':16})
fig.tight_layout()

Figure 2.9: Histogram and chi-square fit of data¶

In [18]:
#Plot Fig. 2.9: Python code
os.chdir("/Users/sshen/climstats")
da1 = pd.read_csv('data/EdmontonT.csv')
x = da1.iloc[:,2]# all rows from 2nd column of data
m1 = np.mean(x)
s1 = np.std(x)
xa = (x - m1)/s1
y = np.array(xa[0:1632]).reshape((272, 6))#vector=>matrix 
w = np.sum(y**2, axis = 1)# calculate the row sums
x = np.linspace(0,40,200)
cpdf = scistats.chi2.pdf(x, 6) # chi-square PDF
fig, ax = plt.subplots(figsize=(12,8))

# plot histogram 
ax.hist(w, bins = 41, edgecolor = 'k',
        color = 'white', linewidth=3)
ax.set_title("Histogram and its Chi-square Fit \n\
for Emonton temperature data")
ax.set_xlabel("Chi-square data [$\degree$C]$^2$",
              size = 22)
ax.set_ylabel("Frequency", size = 22, labelpad = 20)
ax.tick_params(length=6, width=2, labelsize=21)
ax.set_xticks([0,10,20,30,40])
ax.set_yticks([0,10,20,30,40,50])
ax.spines['top'].set_visible(False)
ax.text(10, 35, "Chi-squre distribution: df=6",
        size = 20, fontdict = font)

ax1 = ax.twinx()
# plot chi-square fit
ax1.plot(x,cpdf, color = 'b', linewidth = 3)
ax1.set_ylabel("Chi-square Density", size = 22, 
               color='b', labelpad = 20)
ax1.set_ylim(0,)
ax1.tick_params(length=6, width=2, color = 'b',
                labelcolor = 'b', labelsize=21)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_color('b')
ax1.set_yticks([0.00,0.05, 0.10, 0.15])

plt.show()