Lab 8-1: SVD with Monthly Precipitation#
Derived from svd_tutorial.m (data decomposition tutorial using SVD in MATLAB)
by Brian Henn, UW, October 2013. Updated by Jessica Lundquist, October 2014. Converted to python by Steven Pestana, November 2019
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
# we'll use scipy's IO library and datetime to read .mat files
import scipy.io as sio
import datetime as dt
# SVD function from scipy
from scipy.linalg import svd
Read in PRISM file#
NOTE: on the JupyterHub the following file is not available, meaning that we’ll see a “FileNotFoundError”
You will need to download this file from the course Canvas page to run this notebook.
# Load the MATLAB .mat file into a dict mapping MATLAB variable names to numpy arrays
data = sio.loadmat('PRISM_4km_1982-2012.mat')
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/site-packages/scipy/io/matlab/mio.py in _open_file(file_like, appendmat, mode)
38 try:
---> 39 return open(file_like, mode), True
40 except IOError as e:
FileNotFoundError: [Errno 2] No such file or directory: 'PRISM_4km_1982-2012.mat'
During handling of the above exception, another exception occurred:
FileNotFoundError Traceback (most recent call last)
/tmp/ipykernel_2906/1817629218.py in <module>
----> 1 data = sio.loadmat('PRISM_4km_1982-2012.mat')
/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/site-packages/scipy/io/matlab/mio.py in loadmat(file_name, mdict, appendmat, **kwargs)
222 """
223 variable_names = kwargs.pop('variable_names', None)
--> 224 with _open_file_context(file_name, appendmat) as f:
225 MR, _ = mat_reader_factory(f, **kwargs)
226 matfile_dict = MR.get_variables(variable_names)
/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/contextlib.py in __enter__(self)
110 del self.args, self.kwds, self.func
111 try:
--> 112 return next(self.gen)
113 except StopIteration:
114 raise RuntimeError("generator didn't yield") from None
/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/site-packages/scipy/io/matlab/mio.py in _open_file_context(file_like, appendmat, mode)
15 @contextmanager
16 def _open_file_context(file_like, appendmat, mode='rb'):
---> 17 f, opened = _open_file(file_like, appendmat, mode)
18 try:
19 yield f
/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/site-packages/scipy/io/matlab/mio.py in _open_file(file_like, appendmat, mode)
43 if appendmat and not file_like.endswith('.mat'):
44 file_like += '.mat'
---> 45 return open(file_like, mode), True
46 else:
47 raise IOError(
FileNotFoundError: [Errno 2] No such file or directory: 'PRISM_4km_1982-2012.mat'
# Inspect the dictionary keys
# (the MATLAB variable names, plus scipy's '__header__'/'__version__'/'__globals__' metadata)
print(data.keys())
dict_keys(['__header__', '__version__', '__globals__', 'Zs', 'dates', 'xs', 'ys', 'Xs', 'Ys', 'ppt_mean', 'ppt', 'map_index', 'ny', 'nx', 'nm', 'n', 'ppt_stdev'])
Unpack the mat file into numpy arrays, and convert the MATLAB-format date numbers to Python datetimes (MATLAB datenums count days from year 0, so after converting the integer part with `datetime.fromordinal` we subtract a 366-day offset and add back the fractional day).
# convert matlab format dates to python datetimes
# MATLAB datenums count days from year 0 while Python ordinals start at year 1,
# hence the 366-day offset. Bug fix: the fractional-day term was written as
# int(d)%1, which is always 0 — use d%1 so any time-of-day info is preserved.
datenums = data['dates'][:,0]
dates = [dt.datetime.fromordinal(int(d)) + dt.timedelta(days=d%1) - dt.timedelta(days = 366) for d in datenums]
# Unpack the rest of the data
Zs = data['Zs'] #elevation
xs = data['xs'] # x coordinate
ys = data['ys'] # y coordinate
Xs = data['Xs'] # longitude on a grid
Ys = data['Ys'] # latitude on a grid
ppt_mean = data['ppt_mean'] # long-term mean monthly precipitation (gridded)
ppt = data['ppt'] # monthly precipitation anomalies, one column per month
map_index = data['map_index'][:,0] - 1 # map indices need to be shifted by -1 for numpy arrays (MATLAB is 1-based)
ny = data['ny'][0][0] #size in the y dimension
nx = data['nx'][0][0] # size in the x dimension
nm = data['nm'][0][0] # presumably the number of months (should match len(dates)) — TODO confirm
n = data['n'][0][0] # presumably the total number of grid cells (nx*ny) — TODO confirm
ppt_stdev = data['ppt_stdev'] # standard deviation of monthly precipitation (gridded)
ppt_mean: gridded array of average monthly rainfall (mm) (lat. x lon.)
ppt: gridded array of monthly rainfall anomaly (mm) (lat. x lon.) (ppt has a mean of 0)
ppt_stdev: gridded array of the standard deviation in monthly rainfall by location (mm)
map_index: list of indices for rearranging 1D array into gridded data
I’m also going to define this function, which will help us use the “map_index” to correctly shape our data into 2D maps
def make_map(X, n, nx, ny, index=None):
    """Arrange a 1D vector of grid-cell values into a 2D (ny, nx) map.

    Parameters
    ----------
    X : 1D array
        Data values, one per valid grid cell.
    n : int
        Total number of cells in the full nx*ny grid.
    nx, ny : int
        Grid dimensions in the x and y directions.
    index : 1D integer array, optional
        Positions of X's values within the flattened grid. Defaults to the
        module-level ``map_index`` loaded from the .mat file (the original
        behavior, which read the global directly).

    Returns
    -------
    2D numpy array of shape (ny, nx); cells with no data are NaN.
    """
    if index is None:
        # backward-compatible fallback to the global loaded from the .mat file
        index = map_index
    # create an empty np array full of NaN values, that's the correct length for our 2D data
    a = np.full(n, np.nan)
    # use the index to arrange data values from this selected month into array "a"
    a[index] = X
    # reshape "a" into a 2D array, then transpose so the result is (ny, nx)
    b = a.reshape([nx, ny]).T
    # return our 2D map
    return b
Plot Elevation Data#
# Filled-contour map of terrain elevation over the PRISM domain
elev_levels = np.linspace(0, 4000, 500)
plt.figure(figsize=(3, 7))
plt.contourf(Xs, Ys, Zs, levels=elev_levels, cmap='terrain', vmin=-1000)
plt.colorbar()
plt.ylabel('latitude')
plt.xlabel('longitude')
plt.title('Elevation (m)')
Text(0, 0.5, 'latitude')
Plot Mean Precip. Data#
# Filled-contour map of the long-term mean monthly precipitation (ppt_mean)
plt.figure(figsize=(3,7))
plt.contourf(Xs,Ys,ppt_mean, levels=np.linspace(0,400), cmap='Blues')
plt.colorbar()
plt.title('Mean Precipitation (mm)');
plt.xlabel('longitude')
plt.ylabel('latitude')
Text(0, 0.5, 'latitude')
Plot some monthly precipitation anomalies#
# Choose a month of the record to plot by its index in the time series (0 = first month)
month = 0 #this index tells us which month in our timeseries we want to plot
print(dates[0]) # print our first date (this data is on a monthly timestep, so it will be our first month)
1981-10-01 00:00:00
This first month in our dataset is October 1981
Plot precipitation anomaly for this month (we are using our “make_map” function below to rearrange the precip data into the correct 2D shape)
# Map this month's anomaly (one column of ppt) back onto the 2D grid and contour it
plt.figure(figsize=(3,7))
plt.contourf(Xs, Ys, make_map(ppt[:,month], n, nx, ny), levels=np.linspace(-1.5,1.5), cmap='BrBG')
plt.colorbar()
# NOTE(review): this title says "standard deviations" but the text above says the
# anomalies are in mm — confirm whether ppt was standardized by ppt_stdev.
plt.title('Precipitation Anomaly for Oct 1981\n(number of standard deviations)');
plt.xlabel('longitude')
plt.ylabel('latitude')
Text(0, 0.5, 'latitude')
Decompose the entire dataset using SVD#
# Economy-size SVD (full_matrices=False): U keeps only one column per time step
# instead of the full square spatial basis, saving a lot of memory.
# Note: scipy returns V already transposed (Vh) — rows of this "V" are the temporal PCs.
U, S, V = svd(ppt,full_matrices=False)
Look at the shape of each of the outputs
# U: (space, modes), S: (modes,) singular values, V: (modes, time)
U.shape, S.shape, V.shape
((46718, 372), (372,), (372, 372))
Take a look at the EOFs (U) and PCs (V)
# first index for both U and V
i = 0
# plot first EOF (U)
# EOF i is column i of U; make_map rearranges it onto the 2D grid for plotting
plt.figure(figsize=(3,7))
plt.contourf(Xs, Ys, make_map(U[:,i], n, nx, ny), levels=np.linspace(-0.02, 0.02), cmap='PuOr')
plt.colorbar()
plt.title('EOF 1 (spatial weights)');
plt.xlabel('longitude')
plt.ylabel('latitude')
# Plot first PC (V)
# the matching PC is row i of V — the temporal weights of this spatial pattern
plt.figure(figsize=(10,3))
plt.plot(dates,V[i,:]);
plt.title('PC 1 (temporal weights)');
plt.xlabel('year')
plt.ylabel('weight')
Text(0, 0.5, 'weight')
Note that you can equivalently plot these each with a -1 multiplier (sometimes that makes it easier to interpret things)
# The sign of an SVD mode is arbitrary: negating both U[:,i] and V[i,:]
# leaves their product unchanged, so the -1 flip below is purely cosmetic.
plt.figure(figsize=(3,7))
plt.contourf(Xs,Ys,-1*make_map(U[:,i], n, nx, ny), levels=np.linspace(-0.02, 0.02), cmap='PuOr') # with -1 multiplier
plt.colorbar()
plt.title('EOF 1 (spatial weights)');
plt.xlabel('longitude')
plt.ylabel('latitude')
# Plot first PC
# Note that I'm also going to zoom here so that it's easier to see how things change by month
# over just a few years
i=0
plt.figure(figsize=(10,3))
plt.plot(dates,-1*V[i,:]) # with -1 multiplier
plt.title('PC 1 (temporal weights)')
plt.xlabel('year')
plt.ylabel('weight')
# setting the x limits to the 0th and 48th dates in the dataset
plt.xlim((dates[0], dates[48]));
# Second mode: EOF 2 is column 1 of U, PC 2 is row 1 of V (0-based index i=1)
i = 1
# plot second EOF
plt.figure(figsize=(3,7))
plt.contourf(Xs,Ys,-1*make_map(U[:,i], n, nx, ny), levels=np.linspace(-0.02, 0.02), cmap='PuOr') # with -1 multiplier
plt.colorbar()
plt.title('EOF 2');
plt.xlabel('longitude')
plt.ylabel('latitude')
# Plot second PC
plt.figure(figsize=(10,3))
plt.plot(dates,-1*V[i,:]) # with -1 multiplier
plt.title('PC 2')
plt.xlabel('year')
plt.ylabel('weight');
# Again, try zooming in to some specific dates on the PC
i = 1
plt.figure(figsize=(10,3))
plt.plot(dates,-1*V[i,:]) # with -1 multiplier
plt.title('PC 2 (temporal weights)')
plt.xlabel('year')
plt.ylabel('weight')
plt.xlim((dates[0], dates[48]));
# Third mode: EOF 3 is column 2 of U, PC 3 is row 2 of V (0-based index i=2)
i = 2
# plot third EOF
plt.figure(figsize=(3,7))
plt.contourf(Xs,Ys,-1*make_map(U[:,i], n, nx, ny), levels=np.linspace(-0.02, 0.02), cmap='PuOr') # with -1 multiplier
plt.colorbar()
plt.title('EOF 3');
plt.xlabel('longitude')
plt.ylabel('latitude')
# Plot third PC
plt.figure(figsize=(10,3))
plt.plot(dates,-1*V[i,:]) # with -1 multiplier
plt.title('PC 3')
plt.xlabel('year')
plt.ylabel('weight');
SVD Variance Explained#
Recall that the singular values in S describe the importance of each pattern: the variance explained by pattern i is proportional to the square of its singular value.
What fraction of the whole dataset is described by the first 10 patterns?
Compute the fraction of total variance explained by dividing by the sum of all variance
# SVD Variance Explained.
# Bug fix: the variance captured by mode i is proportional to the *square*
# of its singular value, so normalize S**2 by the total sum of squares.
# Dividing raw S by sum(S) understates the leading modes and overstates
# the trailing ones.
var_exp = S**2 / np.sum(S**2)
Plot our fraction of variance explained versus EOF number
# Scree plot: fraction of variance explained as a function of EOF/mode number
plt.figure(figsize=(5,5))
plt.plot(var_exp)
plt.xlabel('EOF number')
plt.ylabel('Variance Explained')
plt.title('SVD Variance Explained');
Set smaller ranges of axes limits to zoom in to the first 10 EOFs
# Same scree plot, zoomed in to the first 10 modes, with markers at each mode
plt.figure(figsize=(5,5))
plt.plot(var_exp,'-o')
plt.xlabel('EOF number')
plt.ylabel('Variance Explained')
plt.title('SVD Variance Explained')
plt.ylim([0,0.16])
plt.xlim([0,10]);
What fraction of the whole dataset is described by the first 10 patterns?
# the first 10 variance fractions:
print(var_exp[0:10])
# sum of the first 10 (typo "expplained" fixed in the printed message)
print('Percent of overall variance explained by the top 10 patterns = {}%'.format( np.round( 100*np.sum(var_exp[0:10]),1)))
[0.1486109 0.06578349 0.05054685 0.0288071 0.02807555 0.02074685
0.01770611 0.01661285 0.01411301 0.01339134]
Percent of overall variance expplained by the top 10 patterns = 40.4%
Plot 10 leading EOFs#
Note that because they are all plotted in close proximity, we will not put on all the axes labels. In general, please always remember axes labels.
# Grid of the 10 leading EOF spatial patterns (sign-flipped, as above, for readability).
# ax.flat walks the 2x5 axes array row by row, matching the original nested loops.
f, ax = plt.subplots(2, 5, figsize=(15, 10), tight_layout=True)
for mode, axis in enumerate(ax.flat):
    eof_map = -1 * make_map(U[:, mode], n, nx, ny)  # with -1 multiplier
    cf = axis.contourf(Xs, Ys, eof_map, levels=np.linspace(-0.02, 0.02), cmap='PuOr')
    plt.colorbar(cf, ax=axis)
    axis.set_title('EOF {}'.format(mode + 1))
    axis.set_xlabel('Longitude')
    axis.set_ylabel('Latitude')
plt.tight_layout()
# Plot the 10 leading PCs
f, ax = plt.subplots(10,1,figsize=(10,30))
for i in range(10):
ax[i].plot(dates,-1*V[i,:]); # with -1 multiplier
ax[i].set_title('PC {}: {}%'.format(i,np.round(100*var_exp[i]),2));
plt.tight_layout()
Reconstructing Original Data#
Reconstruct and plot October 1981 data with only 10 EOFs/PCs.
We can reconstruct much of the original data using only the first several patterns. This requires much less information than the original data itself.
See the `numpy.linalg.svd` documentation and examples for details on low-rank reconstruction from the SVD factors.
# Select the number of leading EOFs/PCs to use in the reconstruction.
# Bug fix: this was i = 9, which keeps only the first *9* modes below
# (the slice [:i] excludes index i), not the "first 10" the text promises.
i = 10
# Rank-i reconstruction: (U_i * S_i) @ V_i  ==  U_i @ diag(S_i) @ V_i
ppt_reconstructed = np.dot(U[:,:i] * S[:i], V[:i,:])
# Three-panel comparison: true anomaly, reconstructed anomaly, and their difference
fig, ax = plt.subplots(1,3,figsize=(10,7))
month = 0
# Plot the original precip data
cf = ax[0].contourf(Xs,Ys,make_map(ppt[:,month], n, nx, ny), levels=np.linspace(-1.5,1.5), cmap='BrBG')
cbar = plt.colorbar(cf, ax=ax[0])
ax[0].set_title('True Oct 1981 \nMean Precip. Anomaly (mm)');
ax[0].set_xlabel('longitude')
ax[0].set_ylabel('latitude')
# Plot the reconstructed precip data
cf = ax[1].contourf(Xs,Ys,make_map(ppt_reconstructed[:,month], n, nx, ny), levels=np.linspace(-1.5,1.5), cmap='BrBG')
cbar = plt.colorbar(cf, ax=ax[1])
ax[1].set_title('Reconstructed Oct 1981 \nMean Precip. Anomaly (mm)');
# Bug fix: plt.xlabel/plt.ylabel act on the *current* axes (the colorbar just
# created), not this panel — use the Axes methods as panel 0 does.
ax[1].set_xlabel('longitude')
ax[1].set_ylabel('latitude')
# Plot the difference between the reconstructed and original data
difference = make_map(ppt_reconstructed[:,month], n, nx, ny) - make_map(ppt[:,month], n, nx, ny)
cf = ax[2].contourf(Xs,Ys,difference,levels=np.linspace(-2,2),cmap='RdBu_r')
cbar = plt.colorbar(cf, ax=ax[2])
ax[2].set_title('Reconstructed - True Oct 1981 \nMean Precip. Anomaly Difference (mm)');
ax[2].set_xlabel('longitude')
ax[2].set_ylabel('latitude')
plt.tight_layout()
# Same comparison but for total precipitation (long-term mean + anomaly)
fig, ax = plt.subplots(1,3,figsize=(10,7))
month = 0
# Plot the original precip data
cf = ax[0].contourf(Xs,Ys,ppt_mean + make_map(ppt[:,month], n, nx, ny), levels=np.linspace(0,400), cmap='Blues')
cbar = plt.colorbar(cf, ax=ax[0])
ax[0].set_title('True Oct 1981 \nMean Precipitation (mm)');
ax[0].set_xlabel('longitude')
ax[0].set_ylabel('latitude')
# Plot the reconstructed precip data
cf = ax[1].contourf(Xs,Ys,ppt_mean + make_map(ppt_reconstructed[:,month], n, nx, ny), levels=np.linspace(0,400), cmap='Blues')
cbar = plt.colorbar(cf, ax=ax[1])
ax[1].set_title('Reconstructed Oct 1981 \nMean Precipitation (mm)');
# Bug fix: plt.xlabel/plt.ylabel act on the *current* axes (the colorbar just
# created), not this panel — use the Axes methods as panel 0 does.
ax[1].set_xlabel('longitude')
ax[1].set_ylabel('latitude')
# Plot the difference — note the difference of totals equals the difference
# of anomalies, since ppt_mean cancels out of the subtraction.
difference = make_map(ppt_reconstructed[:,month], n, nx, ny) - make_map(ppt[:,month], n, nx, ny)
cf = ax[2].contourf(Xs,Ys,difference,levels=np.linspace(-2,2),cmap='RdBu_r')
cbar = plt.colorbar(cf, ax=ax[2])
ax[2].set_title('Reconstructed - True Oct 1981 \nMean Precip. Anomaly Difference (mm)');
ax[2].set_xlabel('longitude')
ax[2].set_ylabel('latitude')
plt.tight_layout()