Lab 8-1: SVD with Monthly Precipitation#

Derived from svd_tutorial.m (data decomposition tutorial using SVD in MATLAB)

by Brian Henn, UW, October 2013. Updated by Jessica Lundquist, October 2014. Converted to python by Steven Pestana, November 2019

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline

# we'll use scipy's IO library and datetime to read .mat files
import scipy.io as sio 
import datetime as dt

# SVD function from scipy
from scipy.linalg import svd

Read in PRISM file#

NOTE: on the JupyterHub the following file is not available, meaning that we’ll see a “FileNotFoundError”

You will need to download this file from the course Canvas page to run this notebook.

data = sio.loadmat('PRISM_4km_1982-2012.mat')
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/site-packages/scipy/io/matlab/mio.py in _open_file(file_like, appendmat, mode)
     38     try:
---> 39         return open(file_like, mode), True
     40     except IOError as e:

FileNotFoundError: [Errno 2] No such file or directory: 'PRISM_4km_1982-2012.mat'

During handling of the above exception, another exception occurred:

FileNotFoundError                         Traceback (most recent call last)
/tmp/ipykernel_2906/1817629218.py in <module>
----> 1 data = sio.loadmat('PRISM_4km_1982-2012.mat')

/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/site-packages/scipy/io/matlab/mio.py in loadmat(file_name, mdict, appendmat, **kwargs)
    222     """
    223     variable_names = kwargs.pop('variable_names', None)
--> 224     with _open_file_context(file_name, appendmat) as f:
    225         MR, _ = mat_reader_factory(f, **kwargs)
    226         matfile_dict = MR.get_variables(variable_names)

/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/contextlib.py in __enter__(self)
    110         del self.args, self.kwds, self.func
    111         try:
--> 112             return next(self.gen)
    113         except StopIteration:
    114             raise RuntimeError("generator didn't yield") from None

/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/site-packages/scipy/io/matlab/mio.py in _open_file_context(file_like, appendmat, mode)
     15 @contextmanager
     16 def _open_file_context(file_like, appendmat, mode='rb'):
---> 17     f, opened = _open_file(file_like, appendmat, mode)
     18     try:
     19         yield f

/opt/hostedtoolcache/Python/3.7.17/x64/lib/python3.7/site-packages/scipy/io/matlab/mio.py in _open_file(file_like, appendmat, mode)
     43             if appendmat and not file_like.endswith('.mat'):
     44                 file_like += '.mat'
---> 45             return open(file_like, mode), True
     46         else:
     47             raise IOError(

FileNotFoundError: [Errno 2] No such file or directory: 'PRISM_4km_1982-2012.mat'
# Inspect the dictionary keys
print(data.keys())
dict_keys(['__header__', '__version__', '__globals__', 'Zs', 'dates', 'xs', 'ys', 'Xs', 'Ys', 'ppt_mean', 'ppt', 'map_index', 'ny', 'nx', 'nm', 'n', 'ppt_stdev'])

Unpack the mat file into numpy arrays, format dates to python datetimes following the method outlined here.

# convert matlab format dates to python datetimes
datenums = data['dates'][:,0]
dates = [dt.datetime.fromordinal(int(d)) + dt.timedelta(days=int(d)%1) - dt.timedelta(days = 366) for d in datenums]

# Unpack the rest of the data
Zs = data['Zs'] #elevation
xs = data['xs'] # x coordinate
ys = data['ys'] # y coordinate
Xs = data['Xs'] # longitude on a grid
Ys = data['Ys'] # latitude on a grid
ppt_mean = data['ppt_mean']
ppt = data['ppt']
map_index = data['map_index'][:,0] - 1 # map indices need to be shifted by -1 for numpy arrays
ny = data['ny'][0][0] #size in the y dimension
nx = data['nx'][0][0] # size in the x dimension
nm = data['nm'][0][0]
n = data['n'][0][0]
ppt_stdev = data['ppt_stdev']

ppt_mean: gridded array of average monthly rainfall (mm) (lat. x lon.)

ppt: gridded array of monthly rainfall anomaly (mm) (lat. x lon.) (ppt has a mean of 0)

ppt_stdev: gridded array of the standard deviation in monthly rainfall by location (mm)

map_index: list of indices for rearranging 1D array into gridded data

I’m also going to define this function, which will help us use the “map_index” to correctly shape our data into 2D maps

def make_map(X, n, nx, ny):
    # create an empty np array full of NaN values, that's the correct length for our 2D data
    a = np.full(n, np.nan)
    # use the map_index to arrange data values from this selected month into array "a"
    a[map_index] = X
    # reshape "a" into a 2D array of the correct shape
    b = a.reshape([nx, ny]).T
    # return our 2D map
    return b

Plot Elevation Data#

plt.figure(figsize=(3,7))
plt.contourf(Xs,Ys,Zs, levels=np.linspace(0,4000,500), cmap='terrain', vmin=-1000)
plt.colorbar()
plt.title('Elevation (m)')
plt.xlabel('longitude')
plt.ylabel('latitude')
Text(0, 0.5, 'latitude')
../../_images/14e8bd0065c4174a882c891b69564a11b3a42a31d79dab6bd5994849c2bce2b4.png

Plot Mean Precip. Data#

plt.figure(figsize=(3,7))
plt.contourf(Xs,Ys,ppt_mean, levels=np.linspace(0,400), cmap='Blues')
plt.colorbar()
plt.title('Mean Precipitation (mm)');
plt.xlabel('longitude')
plt.ylabel('latitude')
Text(0, 0.5, 'latitude')
../../_images/e220a8a852c569c5bcf9c89677983eddf9a120712642a70fb3b2b3acf009301f.png

Plot some monthly precipitation anomalies#

month = 0  #this index tells us which month in our timeseries we want to plot
print(dates[0]) # print our first date (this data is on a monthly timestep, so it will be our first month)
1981-10-01 00:00:00

This first month in our dataset is October 1981

Plot precipitation anomaly for this month (we are using our “make_map” function below to rearrange the precip data into the correct 2D shape)

plt.figure(figsize=(3,7))
plt.contourf(Xs, Ys, make_map(ppt[:,month], n, nx, ny), levels=np.linspace(-1.5,1.5), cmap='BrBG')
plt.colorbar()
plt.title('Precipitation Anomaly for Oct 1981\n(number of standard deviations)');
plt.xlabel('longitude')
plt.ylabel('latitude')
Text(0, 0.5, 'latitude')
../../_images/e333b95936041c60fe5653d1ceb96140580f9b877407f58eadde378714b89c8d.png

Decompose the entire dataset using SVD#

U, S, V = svd(ppt,full_matrices=False)

Look at the shape of each the outputs

U.shape, S.shape, V.shape
((46718, 372), (372,), (372, 372))

Take a look at the EOFs (U) and PCs (V)

# first index for both U and V
i = 0

# plot first EOF (U)
plt.figure(figsize=(3,7))
plt.contourf(Xs, Ys, make_map(U[:,i], n, nx, ny), levels=np.linspace(-0.02, 0.02), cmap='PuOr')
plt.colorbar()
plt.title('EOF 1 (spatial weights)');
plt.xlabel('longitude')
plt.ylabel('latitude')

# Plot first PC (V)
plt.figure(figsize=(10,3))
plt.plot(dates,V[i,:]);
plt.title('PC 1 (temporal weights)');
plt.xlabel('year')
plt.ylabel('weight')
Text(0, 0.5, 'weight')
../../_images/1bfebfbab5580d2ae748729c7a9514cee77c80be2c7048164803d3772c9e230b.png ../../_images/8d1f04d6d4bb75931800e64e244be6eac0460970d649d4f39ec557d1f7af51e2.png

Note that you can equivalently plot these each with a -1 multiplier (sometimes that makes it easier to interpret things)

plt.figure(figsize=(3,7))
plt.contourf(Xs,Ys,-1*make_map(U[:,i], n, nx, ny), levels=np.linspace(-0.02, 0.02), cmap='PuOr') # with -1 multiplier
plt.colorbar()
plt.title('EOF 1 (spatial weights)');
plt.xlabel('longitude')
plt.ylabel('latitude')

# Plot first PC
# Note that I'm also going to zoom here so that it's easier to see how things change by month
# over just a few years
i=0
plt.figure(figsize=(10,3))
plt.plot(dates,-1*V[i,:])  # with -1 multiplier
plt.title('PC 1 (temporal weights)')
plt.xlabel('year')
plt.ylabel('weight')
# setting the x limits to the 0th and 48th dates in the dataset
plt.xlim((dates[0], dates[48]));
../../_images/6a3d4d02621b2856bc21362cd990589b320a08e885a088a546b8f71433ebe72a.png ../../_images/8cc35fc28e905d0a2ada3244899aaf3ba6403db52d8df1a61f6f230cdf6db19b.png
i = 1
# plot second EOF
plt.figure(figsize=(3,7))
plt.contourf(Xs,Ys,-1*make_map(U[:,i], n, nx, ny), levels=np.linspace(-0.02, 0.02), cmap='PuOr')  # with -1 multiplier
plt.colorbar()
plt.title('EOF 2');
plt.xlabel('longitude')
plt.ylabel('latitude')

# Plot second PC
plt.figure(figsize=(10,3))
plt.plot(dates,-1*V[i,:])   # with -1 multiplier
plt.title('PC 2')
plt.xlabel('year')
plt.ylabel('weight');
../../_images/4237e873b27af8b28bc8b16660eb121d04ee8a8a26517bed92e6f7a15c0bc9e9.png ../../_images/c60f5009c9f26d87ab3ab554405b9e6c446c0022b2f39b8836ed2d9be89303a9.png
# Again, try zooming in to some specific dates on the PC
i = 1
plt.figure(figsize=(10,3))
plt.plot(dates,-1*V[i,:])  # with -1 multiplier
plt.title('PC 2 (temporal weights)')
plt.xlabel('year')
plt.ylabel('weight')

plt.xlim((dates[0], dates[48]));
../../_images/7ceb1b3de55c56186a81b7df3cd56abf6eebd3c0780968c5c63930562547547c.png
i = 2

# plot third EOF
plt.figure(figsize=(3,7))
plt.contourf(Xs,Ys,-1*make_map(U[:,i], n, nx, ny), levels=np.linspace(-0.02, 0.02), cmap='PuOr')  # with -1 multiplier
plt.colorbar()
plt.title('EOF 3');
plt.xlabel('longitude')
plt.ylabel('latitude')

# Plot third PC
plt.figure(figsize=(10,3))
plt.plot(dates,-1*V[i,:])  # with -1 multiplier
plt.title('PC 3')
plt.xlabel('year')
plt.ylabel('weight');
../../_images/b3ada0da71335d7d79fc17fe104515ee91cd0478d741054de19c927a3a250476.png ../../_images/7b366e1a1adf6290039e44b6198051a6e40e937bc1f4645ba8e943f01b8eeda6.png

SVD Variance Explained#

Recall that the values in matrix S describes the variance explained by each pattern.

  • What fraction of the whole dataset is described by the first 10 patterns?

Compute the fraction of total variance explained by dividing by the sum of all variance

# SVD Variance Explained, divide S values by the sum of all S
var_exp = S / np.sum(S)

Plot our fraction of variance explained versus EOF number

plt.figure(figsize=(5,5))
plt.plot(var_exp)
plt.xlabel('EOF number')
plt.ylabel('Variance Explained')
plt.title('SVD Variance Explained');
../../_images/74214522e22a1c6f8b101f9af4398b1abae796be1a7f9d926b8d6fecce80331f.png

Set smaller ranges of axes limits to zoom in to the first 10 EOFs

plt.figure(figsize=(5,5))
plt.plot(var_exp,'-o')
plt.xlabel('EOF number')
plt.ylabel('Variance Explained')
plt.title('SVD Variance Explained')
plt.ylim([0,0.16])
plt.xlim([0,10]);
../../_images/75d77893ea532ddcffc647caff861a2eaea86974de18781b23950a1573ae93d6.png
  • What fraction of the whole dataset is described by the first 10 patterns?

# the first 10:
print(var_exp[0:10])

# sum of the first 10
print('Percent of overall variance expplained by the top 10 patterns = {}%'.format( np.round( 100*np.sum(var_exp[0:10]),1)))
[0.1486109  0.06578349 0.05054685 0.0288071  0.02807555 0.02074685
 0.01770611 0.01661285 0.01411301 0.01339134]
Percent of overall variance expplained by the top 10 patterns = 40.4%

Plot 10 leading EOFs#

Note that because they are all plotted in close proximity, we will not put on all the axes labels. In general, please always remember axes labels.

f, ax = plt.subplots(2,5,figsize=(15,10), tight_layout=True)
i = 0
for row in range(2):
    for column in range(5):    
      
        cf = ax[row,column].contourf(Xs,Ys,-1*make_map(U[:,i], n, nx, ny), levels=np.linspace(-0.02, 0.02), cmap='PuOr') # with -1 multiplier
        cbar = plt.colorbar(cf, ax=ax[row,column])
        ax[row,column].set_title('EOF {}'.format(i+1))
        ax[row,column].set_xlabel('Longitude')
        ax[row,column].set_ylabel('Latitude')
        i+=1

plt.tight_layout()
../../_images/9d3172a67933a2f0b6d0703f5d374eeba89a9e5700a5fde96f83e2d738d38809.png
# Plot the 10 leading PCs
f, ax = plt.subplots(10,1,figsize=(10,30))
for i in range(10):
    ax[i].plot(dates,-1*V[i,:]); # with -1 multiplier
    ax[i].set_title('PC {}: {}%'.format(i,np.round(100*var_exp[i]),2));
plt.tight_layout()
../../_images/789f58247e940e3f2a2de09aeb08bd492a4d3347992cdb087a2559da15ebfbf7.png

Reconstructing Original Data#

Reconstruct and plot October 1981 data with only 10 EOFs/PCs.

We can reconstruct much of the original data using only the first several patterns. This requires much less information than the original data itself.

See documentation and examples here.

# Select the first 10 to reconstruct the precip data
i=9
ppt_reconstructed = np.dot(U[:,:i] * S[:i], V[:i,:])
fig, ax = plt.subplots(1,3,figsize=(10,7))

month = 0

# Plot the original precip data
cf = ax[0].contourf(Xs,Ys,make_map(ppt[:,month], n, nx, ny), levels=np.linspace(-1.5,1.5), cmap='BrBG')
cbar = plt.colorbar(cf, ax=ax[0])
ax[0].set_title('True Oct 1981 \nMean Precip. Anomaly (mm)');
ax[0].set_xlabel('longitude')
ax[0].set_ylabel('latitude')

# Plot the reconstructed precip data
cf = ax[1].contourf(Xs,Ys,make_map(ppt_reconstructed[:,month], n, nx, ny), levels=np.linspace(-1.5,1.5), cmap='BrBG')
cbar = plt.colorbar(cf, ax=ax[1])
ax[1].set_title('Reconstructed Oct 1981 \nMean Precip. Anomaly (mm)');
plt.xlabel('longitude')
plt.ylabel('latitude')

# Plot the difference between the original and reconstructed data
difference = make_map(ppt_reconstructed[:,month], n, nx, ny) - make_map(ppt[:,month], n, nx, ny)
cf = ax[2].contourf(Xs,Ys,difference,levels=np.linspace(-2,2),cmap='RdBu_r')
cbar = plt.colorbar(cf, ax=ax[2])
ax[2].set_title('Reconstructed - True Oct 1981 \nMean Precip. Anomaly Difference (mm)');
plt.xlabel('longitude')
plt.ylabel('latitude')

plt.tight_layout()
../../_images/ff6febbf02f8c27cdddf8308ed674e7fbe60146bdf60bc8fb9b7a6696687c8fb.png
fig, ax = plt.subplots(1,3,figsize=(10,7))

month = 0

# Plot the original precip data
cf = ax[0].contourf(Xs,Ys,ppt_mean + make_map(ppt[:,month], n, nx, ny), levels=np.linspace(0,400), cmap='Blues')
cbar = plt.colorbar(cf, ax=ax[0])
ax[0].set_title('True Oct 1981 \nMean Precipitation (mm)');
ax[0].set_xlabel('longitude')
ax[0].set_ylabel('latitude')

# Plot the reconstructed precip data
cf = ax[1].contourf(Xs,Ys,ppt_mean + make_map(ppt_reconstructed[:,month], n, nx, ny), levels=np.linspace(0,400), cmap='Blues')
cbar = plt.colorbar(cf, ax=ax[1])
ax[1].set_title('Reconstructed Oct 1981 \nMean Precipitation (mm)');
plt.xlabel('longitude')
plt.ylabel('latitude')

# Plot the difference between the original and reconstructed data
difference = make_map(ppt_reconstructed[:,month], n, nx, ny) - make_map(ppt[:,month], n, nx, ny)
cf = ax[2].contourf(Xs,Ys,difference,levels=np.linspace(-2,2),cmap='RdBu_r')
cbar = plt.colorbar(cf, ax=ax[2])
ax[2].set_title('Reconstructed - True Oct 1981 \nMean Precip. Anomaly Difference (mm)');
plt.xlabel('longitude')
plt.ylabel('latitude')

plt.tight_layout()
../../_images/a6172a8a8cdcd5fe4ddfb52afdc921f944280bf47f14a19983caf04324ec9811.png