Chapter 7: Introduction to Time Series


This Python code was developed based on the R code written by
Samuel Shen, Distinguished Professor, San Diego State University, California, USA

Sandra Villamar and Kaelia Okamura contributed to the Python code development.
Kaelia Okamura composed and finalized the code.



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

from statsmodels.tsa.seasonal import seasonal_decompose
from datetime import datetime, timedelta
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from random import gauss
from random import seed
import scipy.stats as stats
import math
import os
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
#from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.arima_model import ARIMA
In [2]:
# Style Dictionary to standarize plotting 
# scheme between different python scripts 
styledict = {'xtick.labelsize':21,
             'xtick.major.size':9,
             'xtick.major.width':1,
             'ytick.labelsize':21,
             'ytick.major.size':9,
             'ytick.major.width':1,
             'legend.framealpha':0.0,
             'legend.fontsize':15,
             'axes.labelsize':22,
             'axes.titlesize':22,
             'axes.linewidth':2,
             'figure.figsize':(12,8),
             'savefig.format':'jpg'}
plt.rcParams.update(**styledict)

# define font style for text
font = {'family': 'serif',
        'color':  'blue',
        'weight': 'normal',
        'size': 12,
        }
In [3]:
# Go to working directory 
os.chdir("/users/sshen/climstats")

Fig. 7.1: Monthly CO2 Keeling Curve¶

In [4]:
#Python code for plotting: Fig. 7.1: The Keeling curve
os.chdir("/users/sshen/climstats")
co2m = pd.read_csv('data/co2_mm_mlo.csv', skiprows = 51)
print(co2m.head(5))
df = pd.DataFrame(co2m)
print(df.shape)
plt.plot(co2m['decimal date'], co2m['average'], color = 'r')
plt.plot(co2m['decimal date'], co2m['interpolated'], 
         color = 'k')
plt.title('Monthly Atmospheric CO\N{SUBSCRIPT TWO} \
at Mauna Loa Observatory', pad = 15)
plt.ylabel('Parts Per Million [ppm]',
          labelpad = 15)
# text inside the figure
plt.text(1958,410, 
      s='Scripps Institution of Oceanography', size = 18)
plt.text(1958,400, 
         s='NOAA Global Monitoring Laboratory', size = 18)
# save figure
plt.savefig("fig0701.eps")
   year  month  decimal date  average  interpolated  trend  ndays  Unnamed: 7
0  1958      3     1958.2027   315.70        314.43     -1  -9.99       -0.99
1  1958      4     1958.2877   317.45        315.16     -1  -9.99       -0.99
2  1958      5     1958.3699   317.51        314.71     -1  -9.99       -0.99
3  1958      6     1958.4548   317.24        315.14     -1  -9.99       -0.99
4  1958      7     1958.5370   315.86        315.18     -1  -9.99       -0.99
(749, 8)

Fig. 7.2: Monthly Atmospheric CO2 at Mauna Loa Observatory¶

In [5]:
os.chdir("/users/sshen/climstats")
co2m = pd.read_csv('data/co2_mm_mlo.csv', skiprows = 51) 
In [6]:
# get date into correct format in order to plot time series: 
date_index = []

# convert decimal date into datetime object, and store
df_co2 = df 
for i in df_co2['decimal date']:
    start = float(i)
    year = int(float(start))
    rem = start - year
    base = datetime(year, 1, 1)
    result = base + timedelta(seconds=(base.replace(year=
           base.year + 1) - base).total_seconds() * rem)
    date_index.append(result)

date_index = pd.to_datetime(date_index)
In [7]:
# Create a time series starting from March 1958 to July 2020 with monthly frequency
from statsmodels.tsa.seasonal import seasonal_decompose
os.chdir("/users/sshen/climstats")
co2m = pd.read_csv('data/co2_mm_mlo.csv', skiprows = 51) 
data_values = ser = pd.Series(co2m['average'])
ser.index = date_index
# get observed data in correct format
# convert values from string to float):
data_values = [float(i) for i in ser]
co2 = ser
co2.index = pd.date_range('1958-03-17', '2020-08-17', freq = 'M')
#pd.date_range(start='1958-03', end='2020-08', freq='M')

# Display the time series
print(co2.head())

# Decompose the time series into trend, seasonal, and residual components
decomposition = sm.tsa.seasonal_decompose(co2, model='additive')

# Plot the decomposed time series
plt.figure(figsize=(11, 16))
plt.subplot(411)
plt.plot(co2)
plt.title(r'Monthly Atmospheric CO$_2$ [ppm] at Mauna Loa Observatory')
plt.ylabel('Observed Data')
plt.subplot(412)
plt.plot(decomposition.trend)
plt.ylabel('Trend')
plt.subplot(413)
plt.plot(decomposition.seasonal)
plt.ylabel('Seasonality')
plt.subplot(414)
plt.plot(decomposition.resid)
plt.ylabel('Random')
plt.tight_layout()
plt.savefig("fig0702.eps")
plt.show()

#you can also use a simple command to make the plots
#decomposition.plot()
#but the figures do not look as nice
1958-03-31    315.70
1958-04-30    317.45
1958-05-31    317.51
1958-06-30    317.24
1958-07-31    315.86
Freq: M, Name: average, dtype: float64

Fig. 7.3: Observed (red) and ETS Forecast (blue)¶

In [8]:
train_series = ser
In [9]:
from statsmodels.tsa.api import SimpleExpSmoothing 
#Use the data "train_series" and "ser" from 
#  the code for Fig. 7.2
# Fit Holt Winter's Exponential Smoothing model, 
# with additive trend and season, then make 
# a forecast
model = ExponentialSmoothing(train_series, seasonal_periods = 12, trend='add', seasonal='add', freq='M')
model_fit = model.fit()
# forecast 48 months
pred = model_fit.forecast(48)

s = seasonal_decompose(ser, model='additive')

# set up plot
fig, ax = plt.subplots()
# labels
plt.title('Observed (red) and ETS Forecast (blue)', pad=15)
plt.ylabel('Parts Per Million [ppm]', labelpad = 15)

# plot observed and forecast lines
plt.plot(s.observed, color='red', linewidth=2)
plt.plot(pred, linewidth = 2)
# save figure
plt.savefig("fig0703.eps")
plt.show()

Fig. 7.4: Daily Tmin at St Paul Station, Minnesota 1941-1950¶

In [10]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Read the data
tda = pd.read_csv('data/StPaulStn.txt', header=None, delim_whitespace=True)

# Extracting Tmin and dates
t1 = tda.iloc[:, 7]  # Assuming the date is in column 8 (0-indexed)
tmin = tda.iloc[:, 11]  # Assuming Tmin is in column 12 (0-indexed)

# Selecting the date range: 1 Jan 1941-31 Dec 1950
print(tda.iloc[1461:1462, :])
#GHCND:USW00014927  ST.  PAUL  AIRPORT  MN  US  19410101
print(tda.iloc[5111:5112, :])
#GHCND:USW00014927  ST.  PAUL  AIRPORT  MN  US  19501213
ta = t1[1461:5111]
da = tmin[1461:5111]

# Convert date string to datetime
ta = pd.to_datetime(ta, format='%Y%m%d')

# Create a time series
tmin_ts = pd.Series(data=da.values, index=ta)

# Decompose the time series
decomposition = sm.tsa.seasonal_decompose(tmin_ts, model='additive', period=365)

#Plot the data and decompostions by a simple command
#decomposition.plot()
#Or using the following for nice figures

# Plot the decomposed time series
plt.figure(figsize=(11, 15))
plt.subplot(411)
plt.plot(decomposition.observed, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(decomposition.trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(decomposition.seasonal, label='Seasonal')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(decomposition.resid, label='Residual')
plt.legend(loc='best')
plt.tight_layout()
plt.savefig("fig0704.eps") #save the figure
plt.show()
                     0    1     2         3        4   5   6         7     8   \
1461  GHCND:USW00014927  ST.  PAUL  DOWNTOWN  AIRPORT  MN  US  19410101  11.7   

          9    10   11      12  
1461 -9999.0  1.7 -1.1 -9999.0  
                     0    1     2         3        4   5   6         7    8   \
5111  GHCND:USW00014927  ST.  PAUL  DOWNTOWN  AIRPORT  MN  US  19501231  0.0   

          9    10   11      12  
5111 -9999.0  3.9 -6.1 -9999.0  
In [11]:
#Another way: This code is printed in the book
os.chdir("/users/sshen/climstats")
#Read .txt data into Pandas DataFrame
dat = pd.read_fwf('data/StPaulStn.txt', header=None)
print(dat.iloc[1461:1462, :])
#GHCND:USW00014927  ST.  PAUL  AIRPORT  MN  US  19410101
print(dat.iloc[5111:5112, :])
#GHCND:USW00014927  ST.  PAUL  AIRPORT  MN  US  19501213
print(dat.shape)
Tmin = dat[1461:5111][11] #Extract Tmin data from column 12
Tmin.index = dat.iloc[1461:5111, 7]
# Convert date string to datetime
ta = pd.to_datetime(ta, format='%Y%m%d')
# Create a time series
tmin_ts = pd.Series(data=da.values, index=ta)
ets = sm.tsa.seasonal_decompose(tmin_ts, model='additive', period=365)
ets.plot() #plot the deomposition
plt.savefig("fig0704.eps") #save the figure
plt.show()
                     0    1     2         3        4   5   6         7     8   \
1461  GHCND:USW00014927  ST.  PAUL  DOWNTOWN  AIRPORT  MN  US  19410101  11.7   

          9    10   11      12  
1461 -9999.0  1.7 -1.1 -9999.0  
                     0    1     2         3        4   5   6         7    8   \
5111  GHCND:USW00014927  ST.  PAUL  DOWNTOWN  AIRPORT  MN  US  19501231  0.0   

          9    10   11      12  
5111 -9999.0  3.9 -6.1 -9999.0  
(7589, 13)

Fig. 7.5: A Simulated Time Series of White Noise¶

In [12]:
seed(1)# set seed to ensure the same simulation result
# create the white noise series
white_noise = [gauss(0.0, 1.0) for i in range(500)]
white_noise_ser = pd.Series(white_noise)
# set up plot
fig, ax = plt.subplots(figsize=(12,6))
# white noise
plt.plot(white_noise_ser, 'k')
#plt.plot(white_noise_ser, color='black')
# add labels
plt.title('A Simulated Time Series of White Noise',
         pad = 15)
plt.ylabel(r'$W_{t}$', labelpad = 15)
plt.xlabel('Discrete Time Steps', labelpad = 15)
plt.savefig("fig0705.eps") # save figure
plt.show()

Fig. 7.6: Histogram and ACF of White Noise¶

In [13]:
#set up the plot layout: two panels in a row
fig, ax = plt.subplots(1,2, figsize = (12, 5))
#histogram set up
ax[0].set_xlim(-4,4)
ax[0].set_ylim(0,.5)
ax[0].set_title('Histogram of a White Noise Time Series', 
                pad = 20, size = 18)
ax[0].set_xlabel(r'$W_{t}$', labelpad = 15, size = 20)
ax[0].set_ylabel('Density', labelpad = 15, size = 20)

#Plot the histogram of white noise
ax[0].hist(white_noise_ser, density=True, 
           edgecolor='black', fc='None')
#Fit the histogram to the standard normal distribution
mu = 0
variance = 1
sigma = math.sqrt(variance)
x = np.linspace(-4,4,100)
ax[0].plot(x, stats.norm.pdf(x, mu, sigma), color='blue')

#Plot ACF
sm.graphics.tsa.plot_acf(white_noise_ser, 
                         ax=ax[1], color = 'k')
#Add labels and ticks for ACF
ax[1].set_title('Auto-correlation of White Noise',
                pad = 20, size = 20)
ax[1].set_xlabel('Lag', labelpad = 15, size = 20)
ax[1].set_ylabel('ACF', labelpad = 15, size = 20)
ax[1].set_xticks([0,5,10,15,20,25])

plt.tight_layout()
plt.savefig("fig0706.pdf") # save figure
plt.show()

Fig. 7.7: Random Walk Time Series¶

In [14]:
#Python plot Fig. 7.7(a): Random walk time series
#set seed to ensure the same simulation result
seed(100)
# Fig. 7.7(a): Five realizations with different sigma values
n = 1000  #Total number of time steps
m = 5  #Number of time series realizations
drift = 0.05  #Drift delta = 0.05
sd = [0.1, 0.4, 0.7, 1.0, 1.5]  #SD sigma

#generate the random walk time series data
X = np.zeros((m,n))
for i in range(m):
    white_noise = [gauss(0.0, sd[i]) for k in range(n)]
    
    for j in range(n-1):
        X[i, j+1] = drift + X[i,j] + white_noise[j]

# plot the five time series realizations
seed(100) #set seed
fig, ax = plt.subplots() #set up plot
#plot five random walks
plt.plot(X[0,:], color='black', linewidth=2, 
         label=r'$\sigma$ = 0.1')
plt.plot(X[1,:], color='blue', linewidth=2, 
         label=r'$\sigma$ = 0.4')
plt.plot(X[2,:], color='red', linewidth=2, 
         label=r'$\sigma$ = 0.7')
plt.plot(X[3,:], color='orange', linewidth=2, 
         label=r'$\sigma$ = 1.0')
plt.plot(X[4,:], color='brown', linewidth=2, 
         label=r'$\sigma$ = 1.5')
#add legend
fig.legend(loc=(.15,.65), fontsize = 18)
blue_series = X[1,:]
#add labels
plt.title(r'Five RW realizations: $\delta = 0.05$', 
          pad = 15)
plt.ylabel(r'$X_{t}$', labelpad=15)
plt.xlabel('Time steps: t', labelpad=15)
plt.savefig("fig0707a.eps") # save figure
plt.show()        
In [15]:
# Fig. 7.7(b): 100 realizations 
# random walk to show variance increasing with time
seed(100)# set seed
n = 1000  #Total number of time steps
m = 100  #Number of time series realizations
drift = 0  #Drift delta = 0
sd = np.ones(m)  #SD sigma is the same for all realizations
#Generate the random walk time series data
X = np.zeros((m,n))
for i in range(m):
    white_noise = [gauss(0.0, sd[i]) for k in range(n)]
    
    for j in range(n-1):
        X[i, j+1] = drift + X[i,j] + white_noise[j]
        
# plot the time series realizations
seed(100)# set seed
# set up plot
fig, ax = plt.subplots()
# random walks
for i in range(m):
    plt.plot(X[i,:], color='black', linewidth=1.5)
    
#sd of cols of X
col_sd = np.std(X,axis=0)
plt.plot(col_sd, color='red', linewidth=2, 
  label='Standard deviation of the simulations: sd(t)')
plt.plot(-col_sd, color='red', linewidth=2)
#theoretical sd
theo_sd = np.sqrt(np.arange(1,n+1))
plt.plot(theo_sd, color='blue', linewidth=2, 
         label=r'Theoretical formula: SD=$\sqrt{t}$')
plt.plot(-theo_sd, color='blue', linewidth=2)
#Add legend
fig.legend(loc=(.16,.79), fontsize = 18)

#Add labels
plt.title(r'100 RW Realizations: $\delta = 0, \sigma = 1$',
          pad=15)
plt.ylabel(r'$X_{t}$', labelpad=10)
plt.xlabel('Time steps: t', labelpad=15)
ax.set_ylim(-100,110)
ax.set_yticks([-100,-50,0,50,100])

plt.savefig("fig0708b.eps") # save figure
plt.show()       

7.4.3: Test for Stationarity¶

In [16]:
# Stationarity test for the blue time series in Fig. 7.7:
adf_test = adfuller(blue_series, autolag='AIC')
print('p-value = ', adf_test[1])
#p-value =  0.5653243650129005
# note: the p-value is large, 
# meaning that we keep the null hypothesis of the ADF test: 
# the blue time series is non-stationary.
p-value =  0.5653243650129005
In [17]:
#Stationarity test for white noise:
adf_test = adfuller(white_noise, autolag='AIC')
print('p-value = ', adf_test[1])
#p-value =  0.0
# note: the p-value is small, 
# meaning that we reject the null hypothesis of the ADF test: 
# the white noise time series is stationary.
p-value =  0.0

Fig. 7.8: Moving Average Time Series¶

In [18]:
seed(101)# set seed
n = 124
# white noise
w = [gauss(0,1) for k in range(n)]  
ma1_uniform = np.zeros(n)
ma1_uneven = np.zeros(n)
ma5_uniform = np.zeros(n)
#generate three MA time series
for t in range(4,n):
    ma1_uniform[t] = .5*w[t] + .5*w[t-1]
    ma1_uneven[t] = .9*w[t] + .1*w[t-1]
    ma5_uniform[t] = (w[t] + w[t-1] + w[t-2] 
                      + w[t-3] + w[t-4]) / 5
In [19]:
# plot the time series realizations
seed(101)# set seed
# set up plot
fig, ax = plt.subplots(figsize = (12,6))
plt.plot(ma1_uniform[4:n], color='black',
         label='MA(1): a = 0.5, b = 0.5')
plt.plot(ma1_uneven[4:n], color='blue',
         label='MA(1): a = 0.9, b = 0.1')
plt.plot(ma5_uniform[4:n], color='red', 
         label='MA(5): Uniform weight = 1/5')
# add legends
fig.legend(loc=(.16,.71), prop={"size":16})
# add labels
plt.title('Realizations of MA time series',
         pad = 15)
plt.ylabel(r'$X_{t}$', labelpad=15)
plt.xlabel('Time steps', labelpad=15)
ax.set_ylim(-2,4)
plt.savefig("fig0708.eps")# save figure
plt.show()

Fig. 7.9: Autoregressive Time Series AR(1)¶

In [20]:
seed(791)# set seed
n = 121  # total number of time steps
m = 4  # number of time series realizations
lam = .9  # decay parameter
X = np.zeros((m,n))  # matrix for data
X[:,0] = 4 #set the initial condition at t=0
#generate the time series data
for i in range(m):
    for j in range(n-1):
        X[i,j+1] = X[i,j]*lam + gauss(0,.25)
In [21]:
# plot the realizations and their mean
fig, ax = plt.subplots(figsize = (12,6)) # set up plot
# moving average time series
plt.plot(X[0,:], color='purple', linewidth=2)
plt.plot(X[1,:], color='blue', linewidth=2)
plt.plot(X[2,:], color='red', linewidth=2)
plt.plot(X[3,:], color='orange', linewidth=2)
#plot the mean of above series
col_means = np.mean(X, axis=0)
plt.plot(col_means, color='black', linewidth=2)
#add labels
plt.title('Realizations of AR(1) time series and their mean',
         pad = 15)
plt.ylabel(r'$X_{t}$', labelpad=15)
plt.xlabel('Time steps', labelpad=15)
ax.set_ylim(-2,4)
plt.savefig("fig0709.eps")# save figure
plt.show()

Fig. 7.10: ACF of AR(1)¶

In [22]:
seed(82) #set seed to ensure the same result
n = 481  # total number of time steps
m = 2  # number of time series realizations
lam = [0.9, 0.6]  # decay parameter
X = np.zeros((m,n))  # placeholder for data matrix 
X[:,0] = 4

# simulate the time series data
for k in range(m):
    for i in range(n-1):
        X[k,i+1] = X[k,i]*lam[k] + gauss(0,0.25)
In [23]:
# plot the auto-correlation function
fig, ax = plt.subplots(1,2, figsize = (14,6))
# ACF
sm.graphics.tsa.plot_acf(X[0,:], ax=ax[0], 
                         lags=36, color = 'k')
sm.graphics.tsa.plot_acf(X[1,:], ax=ax[1], lags=36, 
     vlines_kwargs={'color':'red'}, color = 'red')

#add labels
ax[0].set_title('Auto−correlation function of AR(1)', 
                pad = 15)
ax[0].set_xlabel('Time lag', labelpad=15)
ax[0].set_ylabel('ACF', labelpad = -5)
ax[0].set_xticks([0,5,15,25,35])
ax[0].set_yticks([-0.2, 0.2, 0.6,1.0])
ax[0].annotate(r'Decay parameter $\lambda$ = 0.9', 
               xy=(5,.8), size = 22)
ax[1].set_title('Auto−correlation function of AR(1)', 
                pad = 15)
ax[1].set_xlabel('Time lag', labelpad=15)
ax[1].set_ylabel('ACF', labelpad=10)
ax[1].set_xticks([0,5,15,25,35])
ax[1].set_yticks([0, 0.4,0.8])
ax[1].annotate(r'Decay parameter $\lambda$ = 0.6',
               xy=(5,.8), color='red', size = 22)
plt.tight_layout()
plt.savefig("fig0710.pdf")# save figure
plt.show()

Fig. 7.11: ARIMA model fitting¶

In [24]:
## NOTE: Run the code for Figs. 7.1 and 7.2 first ##
In [25]:
from statsmodels.tsa.arima.model import ARIMA
# set up series data variable
data_values = [float(i) for i in ser]
data_dates = pd.date_range('1958-03-17', 
                  '2020-08-17', freq = 'M')
co2_series = pd.Series(data_values, 
                       index=data_dates)

# Fit AR(1) model 
AR1_model = ARIMA(co2_series, order=(1,0,0))
AR1_model_fit = AR1_model.fit()
AR1_residuals = AR1_model_fit.resid
AR1_fit = pd.Series(co2_series.values - 
              AR1_residuals.values, data_dates)

# Fit MA(1) model
MA1_model = ARIMA(co2_series, order=(0,0,1))
MA1_model_fit = MA1_model.fit()
MA1_residuals = MA1_model_fit.resid
MA1_fit = pd.Series(co2_series.values - 
              MA1_residuals.values, data_dates)
In [26]:
# drop first values of models
AR1_fit.drop(AR1_fit.index[0], inplace=True)
MA1_fit.drop(MA1_fit.index[0], inplace=True)
In [27]:
fig, ax = plt.subplots()#plot setup
#prepare labels
plt.title(r'Monthly Atmospheric CO$_2$ at Mauna Loa Observatory',
         pad = 15)
plt.ylabel('Parts Per Million [ppm]', labelpad = 15)
#plot observed line
plt.plot(ser, color='red', linewidth=2, 
         linestyle='-', label='Observed data')
#plot AR(1) model
plt.plot(AR1_fit, color='black', linestyle='--',
         linewidth=1, label='AR(1) model fitting')
#plot MA(1) model
plt.plot(MA1_fit, color='blue', linestyle='--', 
         linewidth=2, label='MA(1) model fitting')
#add legend
plt1 = plt.legend(loc=(.10,.7), fontsize = 18)
plt1.get_texts()[0].set_color('r')
plt1.get_texts()[2].set_color('b')
plt.savefig("fig0711.pdf")# save figure
plt.show()