#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of pyunicorn.
# Copyright (C) 2008--2015 Jonathan F. Donges and pyunicorn authors
# URL: <http://www.pik-potsdam.de/members/donges/software>
# License: BSD (3-clause)
"""
Provides classes for generating and analyzing complex climate networks.
"""
#
# Import essential packages
#
# array object and fast numerics
import numpy as np
# Import progress bar for easy progress bar handling
from ..utils import progressbar
# C++ inline code
from .. import weave_inline
# Import cnNetwork for Network base class
from .climate_network import ClimateNetwork
LONG_TYPE = np.int64
DOUBLE_TYPE = np.float32
#
# Define class MutualInfoClimateNetwork
#
class MutualInfoClimateNetwork(ClimateNetwork):
"""
Represents a mutual information climate network.
Constructs a static climate network based on mutual information at zero
lag, as in [Ueoka2008]_.
Mutual information climate networks are undirected, since mutual
information is a symmetrical measure. In contrast to Pearson correlation
used in :class:`.TsonisClimateNetwork`, mutual information has the
potential to detect nonlinear statistical interdependencies.
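
**Example** (a minimal usage sketch; ``data`` stands for a hypothetical
:class:`.ClimateData` instance and the threshold value is illustrative
only):

>>> net = MutualInfoClimateNetwork(data, threshold=0.15,
...                                winter_only=False)
>>> mi = net.mutual_information(data.anomaly(), dump=False)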
"""
#
# Defines internal methods
#
def __init__(self, data, threshold=None, link_density=None,
non_local=False, node_weight_type="surface", winter_only=True,
silence_level=0):
"""
Initialize an instance of MutualInfoClimateNetwork.
.. note::
Either ``threshold`` **OR** ``link_density`` has to be given!
Possible choices for ``node_weight_type``:
- None (constant unit weights)
- "surface" (cos lat)
- "irrigation" (cos**2 lat)
:type data: :class:`.ClimateData`
:arg data: The climate data used for network construction.
:arg float threshold: The threshold of the similarity measure above
which two nodes are linked in the network.
:arg float link_density: The network's desired link density.
:arg bool non_local: Determines whether links between spatially close
nodes should be suppressed.
:arg str node_weight_type: The type of geographical node weight to be
used.
:arg bool winter_only: Determines whether only data points from the
winter months (December, January and February) should be used for
analysis. This may further suppress the annual cycle in the time
series.
:arg int silence_level: The inverse level of verbosity of the object.
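
**Example** (illustrative sketch of the alternative construction via a
prescribed link density; ``data`` and the chosen density are
hypothetical):

>>> net = MutualInfoClimateNetwork(data, link_density=0.05,
...                                node_weight_type="surface")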
"""
if silence_level <= 1:
print "Generating a mutual information climate network..."
self.silence_level = silence_level
# Set instance variables
self.data = data
"""(ClimateData) - The climate data used for network construction."""
self.N = self.data.grid.N
self._prescribed_link_density = link_density
self._winter_only = winter_only
# Class specific settings
self.mi_file = "mutual_information_" + data.data_source + "_" \
+ data.observable_name + ".data"
"""(string) - The name of the file for storing the mutual information
matrix."""
self._set_winter_only(winter_only)
ClimateNetwork.__init__(self, grid=self.data.grid,
similarity_measure=self._similarity_measure,
threshold=threshold,
non_local=non_local,
directed=False,
node_weight_type=node_weight_type,
silence_level=silence_level)
def __str__(self):
"""
Return a string representation of MutualInfoClimateNetwork.
"""
return 'MutualInfoClimateNetwork:\n' + ClimateNetwork.__str__(self)
#
# Defines methods to calculate the mutual information matrix
#
def eval_weave_calculate_mutual_information(self, anomaly):
"""
Compare the fast and the slow weave implementations of the mutual
information calculation.
:type anomaly: 2D Numpy array (time, index)
:arg anomaly: The anomaly time series.
:rtype: tuple of two 2D Numpy arrays (index, index)
:return: the mutual information matrices from the fast and the slow
algorithm.
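
**Example** (sketch, reusing the hypothetical ``net`` and ``data``
objects from the class-level example):

>>> mi_fast, mi_slow = net.eval_weave_calculate_mutual_information(
...     data.anomaly())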
"""
mi_fast = self._weave_calculate_mutual_information(anomaly, fast=True)
mi_slow = self._weave_calculate_mutual_information(anomaly, fast=False)
return (mi_fast, mi_slow)
def _weave_calculate_mutual_information(self, anomaly, n_bins=32,
fast=True):
"""
Calculate the mutual information matrix at zero lag.
The weave code is adapted from the TISEAN 3.0.1 mutual.c module.
:type anomaly: 2D Numpy array (time, index)
:arg anomaly: The anomaly time series.
:arg int n_bins: The number of bins for estimating probability
distributions.
:arg bool fast: Indicates whether the fast or the slow algorithm should
be used.
:rtype: 2D array (index, index)
:return: the mutual information matrix at zero lag.
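
Both inline implementations compute the standard binned (plug-in)
estimate at zero lag,

.. math::
    \hat{I}(X, Y) = \sum_{l,m} p_{lm} \, \log ( p_{lm} / ( p_l \, p_m ) ),

where :math:`p_l` and :math:`p_m` are the marginal and :math:`p_{lm}`
the joint bin probabilities of the rescaled time series; this matches
the ``plm * log(plm/hpm/hpl)`` accumulation in the inline code below.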
"""
if self.silence_level <= 1:
print "Calculating mutual information matrix at zero lag from \
anomaly values using Weave..."
# Normalize anomaly time series to zero mean and unit variance
self.data.normalize_time_series_array(anomaly)
# Create local transposed copy of anomaly
anomaly = np.fastCopyAndTranspose(anomaly)
(N, n_samples) = anomaly.shape
# Get common range for all histograms
range_min = float(anomaly.min())
range_max = float(anomaly.max())
# Rescale all time series to the interval [0,1],
# using the maximum range of the whole dataset.
scaling = float(1. / (range_max - range_min))
# Create array to hold symbolic trajectories
symbolic = np.empty(anomaly.shape, dtype=LONG_TYPE)
# Initialize array to hold 1d-histograms of individual time series
hist = np.zeros((N, n_bins), dtype=LONG_TYPE)
# Initialize array to hold 2d-histogram for one pair of time series
hist2d = np.zeros((n_bins, n_bins), dtype=LONG_TYPE)
# Initialize mutual information array
mi = np.zeros((N, N), dtype=DOUBLE_TYPE)
code = r"""
int i, j, k, l, m;
int symbol, symbol_i, symbol_j;
double norm, rescaled, hpl, hpm, plm;
// Calculate histogram norm
norm = 1.0 / n_samples;
for (i = 0; i < N; i++) {
for (k = 0; k < n_samples; k++) {
// Calculate symbolic trajectories for each time series,
// where the symbols are bins.
rescaled = scaling * (anomaly(i,k) - range_min);
if (rescaled < 1.0) {
symbolic(i,k) = rescaled * n_bins;
}
else {
symbolic(i,k) = n_bins - 1;
}
// Calculate 1d-histograms for single time series
symbol = symbolic(i,k);
hist(i,symbol) += 1;
}
}
for (i = 0; i < N; i++) {
for (j = 0; j <= i; j++) {
// The case i = j is not of interest here!
if (i != j) {
// Calculate 2d-histogram for one pair of time series
// (i,j).
for (k = 0; k < n_samples; k++) {
symbol_i = symbolic(i,k);
symbol_j = symbolic(j,k);
hist2d(symbol_i,symbol_j) += 1;
}
// Calculate mutual information for one pair of time
// series (i,j).
for (l = 0; l < n_bins; l++) {
hpl = hist(i,l) * norm;
if (hpl > 0.0) {
for (m = 0; m < n_bins; m++) {
hpm = hist(j,m) * norm;
if (hpm > 0.0) {
plm = hist2d(l,m) * norm;
if (plm > 0.0) {
mi(i,j) += plm * log(plm/hpm/hpl);
}
}
}
}
}
// Symmetrize MI
mi(j,i) = mi(i,j);
// Reset hist2d to zero in all bins
for (l = 0; l < n_bins; l++) {
for (m = 0; m < n_bins; m++) {
hist2d(l,m) = 0;
}
}
}
}
}
"""
# anomaly must be a contiguous Numpy array for this code to work
# correctly! All the other arrays are generated from scratch in this
# method and are guaranteed to be contiguous by Numpy.
fastCode = r"""
long i, j, k, l, m, in_bins, jn_bins, ln_bins, in_samples, jn_samples,
in_nodes;
double norm, rescaled, hpl, hpm, plm;
double *p_anomaly;
float *p_mi, *p_mi2;
long *p_symbolic, *p_symbolic1, *p_symbolic2, *p_hist, *p_hist1,
*p_hist2, *p_hist2d;
// Calculate histogram norm
norm = 1.0 / n_samples;
// Initialize in_samples, in_bins
in_samples = in_bins = 0;
for (i = 0; i < N; i++) {
// Set pointer to anomaly(i,0)
p_anomaly = anomaly + in_samples;
// Set pointer to symbolic(i,0)
p_symbolic = symbolic + in_samples;
for (k = 0; k < n_samples; k++) {
// Rescale sample into interval [0,1]
rescaled = scaling * (*p_anomaly - range_min);
// Calculate symbolic trajectories for each time series,
// where the symbols are bin numbers.
if (rescaled < 1.0) {
*p_symbolic = rescaled * n_bins;
}
else {
*p_symbolic = n_bins - 1;
}
// Calculate 1d-histograms for single time series
// Set pointer to hist(i, *p_symbolic)
p_hist = hist + in_bins + *p_symbolic;
(*p_hist)++;
// Set pointer to anomaly(i,k+1)
p_anomaly++;
// Set pointer to symbolic(i,k+1)
p_symbolic++;
}
in_samples += n_samples;
in_bins += n_bins;
}
// Initialize in_samples, in_bins, in_nodes
in_samples = in_bins = in_nodes = 0;
for (i = 0; i < N; i++) {
// Set pointer to mi(i,0)
p_mi = mi + in_nodes;
// Set pointer to mi(0,i)
p_mi2 = mi + i;
// Initialize jn_samples, jn_bins
jn_samples = jn_bins = 0;
for (j = 0; j <= i; j++) {
// Don't do anything for i = j, this case is not of
// interest here!
if (i != j) {
// Set pointer to symbolic(i,0)
p_symbolic1 = symbolic + in_samples;
// Set pointer to symbolic(j,0)
p_symbolic2 = symbolic + jn_samples;
// Calculate 2d-histogram for one pair of time series
// (i,j).
for (k = 0; k < n_samples; k++) {
// Set pointer to hist2d(*p_symbolic1, *p_symbolic2)
p_hist2d = hist2d + (*p_symbolic1)*n_bins
+ *p_symbolic2;
(*p_hist2d)++;
// Set pointer to symbolic(i,k+1)
p_symbolic1++;
// Set pointer to symbolic(j,k+1)
p_symbolic2++;
}
// Calculate mutual information for one pair of time
// series (i,j).
// Set pointer to hist(i,0)
p_hist1 = hist + in_bins;
// Initialize ln_bins
ln_bins = 0;
for (l = 0; l < n_bins; l++) {
// Set pointer to hist(j,0)
p_hist2 = hist + jn_bins;
// Set pointer to hist2d(l,0)
p_hist2d = hist2d + ln_bins;
hpl = (*p_hist1) * norm;
if (hpl > 0.0) {
for (m = 0; m < n_bins; m++) {
hpm = (*p_hist2) * norm;
if (hpm > 0.0) {
plm = (*p_hist2d) * norm;
if (plm > 0.0) {
*p_mi += plm * log(plm/hpm/hpl);
}
}
// Set pointer to hist(j,m+1)
p_hist2++;
// Set pointer to hist2d(l,m+1)
p_hist2d++;
}
}
// Set pointer to hist(i,l+1)
p_hist1++;
ln_bins += n_bins;
}
// Symmetrize MI
*p_mi2 = *p_mi;
// Initialize ln_bins
ln_bins = 0;
// Reset hist2d to zero in all bins
for (l = 0; l < n_bins; l++) {
// Set pointer to hist2d(l,0)
p_hist2d = hist2d + ln_bins;
for (m = 0; m < n_bins; m++) {
*p_hist2d = 0;
// Set pointer to hist2d(l,m+1)
p_hist2d++;
}
ln_bins += n_bins;
}
}
// Set pointer to mi(i,j+1)
p_mi++;
// Set pointer to mi(j+1,i)
p_mi2 += N;
jn_samples += n_samples;
jn_bins += n_bins;
}
in_samples += n_samples;
in_bins += n_bins;
in_nodes += N;
}
"""
args = ['anomaly', 'n_samples', 'N', 'n_bins', 'scaling', 'range_min',
'symbolic', 'hist', 'hist2d', 'mi']
if fast:
weave_inline(locals(), fastCode, args, blitz=False)
else:
weave_inline(locals(), code, args)
if self.silence_level <= 1:
print "Done!"
return mi
def _calculate_mutual_information(self, anomaly, n_bins=32):
"""
Calculate the mutual information matrix at zero lag.
.. note::
Slow, since it is based solely on Python and Numpy!
:type anomaly: 2D array (time, index)
:arg anomaly: The anomaly time series.
:arg int n_bins: The number of bins for estimating probability
distributions.
:rtype: 2D array (index, index)
:return: the mutual information matrix at zero lag.
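
This pure Numpy implementation uses the entropy identity

.. math::
    \hat{I}(X, Y) = \hat{H}(X) + \hat{H}(Y) - \hat{H}(X, Y),

with the marginal and joint Shannon entropies estimated from
:func:`numpy.histogram` and :func:`numpy.histogram2d` bin counts.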
"""
if self.silence_level <= 1:
print "Calculating mutual information matrix at zero lag from \
anomaly values..."
# Define references to numpy functions for faster function calls
histogram = np.histogram
histogram2d = np.histogram2d
log = np.log
# Normalize anomaly time series to zero mean and unit variance
self.data.normalize_time_series_array(anomaly)
# Get faster reference to length of time series = number of samples
# per grid point.
n_samples = anomaly.shape[0]
# Initialize mutual information array
mi = np.zeros((self.N, self.N))
# Get common range for all histograms
range_min = anomaly.min()
range_max = anomaly.max()
# Calculate the histograms for each time series
p = np.zeros((self.N, n_bins))
for i in xrange(self.N):
p[i, :] = histogram(
anomaly[:, i], bins=n_bins, range=(range_min, range_max)
)[0].astype("float64")
# Normalize by total number of samples = length of each time series
p /= n_samples
# Make sure that bins with zero estimated probability are not counted
# in the entropy measures.
p[p == 0] = 1
# Compute the information entropies of each time series
H = - (p * log(p)).sum(axis=1)
# Initialize progress bar
if self.silence_level <= 1:
progress = progressbar.ProgressBar(maxval=self.N**2).start()
# Calculate only the lower half of the MI matrix, since MI is
# symmetric with respect to X and Y.
for i in xrange(self.N):
# Update progress bar every 10 steps
if self.silence_level <= 1:
if (i % 10) == 0:
progress.update(i**2)
for j in xrange(i):
# Calculate the joint probability distribution
pxy = histogram2d(
anomaly[:, i], anomaly[:, j], bins=n_bins,
range=((range_min, range_max),
(range_min, range_max)))[0].astype("float64")
# Normalize joint distribution
pxy /= n_samples
# Compute the joint information entropy
pxy[pxy == 0] = 1
HXY = - (pxy * log(pxy)).sum()
# ... and store the result
mi.itemset((i, j), H.item(i) + H.item(j) - HXY)
mi.itemset((j, i), mi.item((i, j)))
if self.silence_level <= 1:
progress.finish()
return mi
def calculate_similarity_measure(self, anomaly):
"""
Calculate the mutual information matrix.
Encapsulates calculation of mutual information with standard
parameters.
:type anomaly: 2D Numpy array (time, index)
:arg anomaly: The anomaly time series.
:rtype: 2D Numpy array (index, index)
:return: the mutual information matrix at zero lag.
"""
return self._weave_calculate_mutual_information(anomaly)
def mutual_information(self, anomaly=None, dump=True):
"""
Return mutual information matrix at zero lag.
Check whether the mutual information matrix (MI) has already been
calculated:
- If yes, return MI from a data file.
- If not, calculate MI, store it in a file and return it.
:type anomaly: 2D Numpy array (time, index)
:arg anomaly: The anomaly time series.
:arg bool dump: Store MI in data file.
:rtype: 2D Numpy array (index, index)
:return: the mutual information matrix at zero lag.
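
**Example** (sketch with the hypothetical ``net`` and ``data`` from
above; if no valid cache file exists, the matrix is recalculated and,
with ``dump=True``, stored in ``self.mi_file``):

>>> mi = net.mutual_information(data.anomaly(), dump=True)
>>> mi.shape == (net.N, net.N)
True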
"""
try:
# Try to load MI from file
if self.silence_level <= 1:
print "Loading mutual information matrix from %s..." % \
self.mi_file
with open(self.mi_file, 'rb') as f:
mi = np.load(f)
# Check if the dimensions of mutual_information correspond to
# the grid.
if mi.shape != (self.N, self.N):
print (self.mi_file +
" in current directory has incorrect dimensions!")
raise RuntimeError
except (IOError, RuntimeError):
if self.silence_level <= 1:
print "An error occurred while loading data from %s." % \
self.mi_file
print "Recalculating mutual information."
mi = self._weave_calculate_mutual_information(anomaly)
if dump:
with open(self.mi_file, 'wb') as f:
if self.silence_level <= 1:
print "Storing in", self.mi_file
mi.dump(f)
return mi
def winter_only(self):
"""
Indicate whether only winter months were used for network generation.
:return bool: whether only winter months were used for network
generation.
"""
return self._winter_only
def _set_winter_only(self, winter_only, dump=False):
"""
Toggle use of exclusively winter data points for network generation.
:arg bool winter_only: Indicates whether only winter months should be
used for network generation.
:arg bool dump: Store MI in data file.
"""
self._winter_only = winter_only
if winter_only:
winter_anomaly = self.data.anomaly_selected_months([0, 1, 11])
mi = self.mutual_information(winter_anomaly, dump=dump)
else:
mi = self.mutual_information(self.data.anomaly(), dump=dump)
self._similarity_measure = mi
def set_winter_only(self, winter_only, dump=True):
"""
Toggle use of exclusively winter data points for network generation.
Also explicitly regenerates the instance of MutualInfoClimateNetwork.
:arg bool winter_only: Indicates whether only winter months should be
used for network generation.
:arg bool dump: Store MI in data file.
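
**Example** (sketch; switches the hypothetical ``net`` from above to
all-season data and rebuilds the network):

>>> net.set_winter_only(False, dump=False)
>>> net.winter_only()
False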
"""
self._set_winter_only(winter_only, dump=dump)
self._regenerate_network()
#
# Defines methods to calculate weighted network measures
#
def mutual_information_weighted_average_path_length(self):
"""
Return mutual information weighted average path length.
:return float: the mutual information weighted average path length.
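
**Example** (sketch; the weighted closeness and vulnerability methods
below are called analogously):

>>> apl = net.mutual_information_weighted_average_path_length()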
"""
if "mutual_information" not in self._path_lengths_cached:
self.set_link_attribute("mutual_information",
abs(self.mutual_information()))
return self.average_path_length("mutual_information")
def mutual_information_weighted_closeness(self):
"""
Return mutual information weighted closeness.
:rtype: 1D Numpy array [index]
:return: the mutual information weighted closeness sequence.
"""
if "mutual_information" not in self._path_lengths_cached:
self.set_link_attribute("mutual_information",
abs(self.mutual_information()))
return self.closeness("mutual_information")
def local_mutual_information_weighted_vulnerability(self):
"""
Return mutual information weighted vulnerability.
:rtype: 1D Numpy array [index]
:return: the mutual information weighted vulnerability sequence.
"""
if "mutual_information" not in self._path_lengths_cached:
self.set_link_attribute("mutual_information",
abs(self.mutual_information()))
return self.local_vulnerability("mutual_information")