networks.py 12.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
A suite of functions that assist in carrying out the recurrence analysis
========================================================================

!! Please import as a module and use it as such -- not as a script !!

"""
# Created: Thu May 26, 2016  03:50PM
# Last modified: Tue Aug 15, 2017  01:21pm
# Copyright: Bedartha Goswami <goswami@pik-potsdam.de>

import numpy as np
import igraph as ig

from utils import _printmsg
from utils import _progressbar_start
from utils import _progressbar_update
from utils import _progressbar_finish


def prob_recurrence_network(dist_list, var_span, ld, iqr,
                            thr_lims=(1E-3, 5E-1), tol=5E-4,
                            verbose=False, pbar=False):
    """
    Return probability of recurrence network for given link density.


    The returned recurrence network has real-valued edge weights from the
    closed interval [0, 1] corresponding to a priori estimated
    probabilities of recurrence between each pair of time points with the
    help of upper and lower bounds on the difference distributions of those
    two time points. Note that `time` here refers to the time-ordering of
    the distribution list ``dist_list``.

    Parameters
    ----------
    dist_list : list
        List of time-ordered cumulative probability distributions. Each item
        of this list is of the same size as ``var_span`` and has float
        entries corresponding to the cumulative probability evaluated at the
        corresponding value in ``var_span``.
    var_span : numpy 1D array
        Contains the range of proxy values on which the cumulative
        probabilities provided in each item of ``dist_list`` are evaluated.
        For best results, it is advised to ensure that ``var_span`` should
        cover a range of values wide enough so that the first and last
        entries of all the CDFs are 0 and 1 respectively.
    ld : number, float
        Desired link density of the final recurrence network.
    iqr : number, float
        Inter-quartile range of the total probability distribution function
        (PDF) of the variable when summed over all time instances.
    thr_lims : tuple, float, optional
        Tuple containing the lower and upper limit of recurrence thresholds
        (as a fraction of ``iqr``) within which the bisection routine should
        search for the appropriate network with desired link density. Note
        that ``thr_lims = (thr_lo, thr_hi)``.
    tol : number, float, optional
        Tolerance level for the bisection routine. This number specifies the
        error margin up to which the returned network's link density has to
        be equal to the desired link density specified with ``ld``.
    verbose: bool, optional
        Specifies verbosity of the routine.
    pbar: bool, optional
        Specifies the display of a progress bar in terminal on execution.

    Returns
    -------
    G : igraph.Graph
        igraph.Graph object with edge weights corresponding to the estimated
        probabilities of recurrence between each pair of nodes (i.e., time
        instances of observation). The edge weights can be accessed by
        ``G.es["weight"]``.

    See Also
    --------
    prob_recurrence_matrix
    link_density

    Notes
    -----
    The probability of recurrence matrix estimated here is obtained using
    the function ``prob_recurrence_matrix`` with the only difference that
    the diagonals are set to zero to avoid self-loops in the network. Also,
    the method based on the upper and lower bounds on difference
    distributions as outlined in Goswami (2015) does not allow us to specify
    the link density beforehand. Thus, a bisection routine between
    pre-specified upper and lower recurrence thresholds ``thr_lim`` is used.
    This incurs an additional parameter, the tolerance level ``tol``, i.e.,
    the error between the link density of the returned network and the
    desired link density.

    References
    ----------
    .. [1] Goswami, Bedartha. "Uncertainties in climate data analysis".
    Chapter 6. PhD Dissertation (2015).
    https://publishup.uni-potsdam.de/frontdoor/index/index/docId/7831
    .. [2] Williamson, Robert C., Downs, Tom. "Probabilistic arithmetic I.
    Numerical methods for calculating convolutions and dependency bounds".
    Int. J. Approx. Reasoning (1990). 4:89-158.

    """
    _printmsg("Estimating network with link density = %.3f" % ld, verbose)
    n = len(dist_list)
    valid_lims, lds = _precnet_check_limits(dist_list, var_span, ld, iqr,
                                            thr_lims, verbose, pbar)
    if not valid_lims:
        P = np.zeros((n, n))
        G = _precmat_to_igraph(P)
    else:
        ld_lims = lds
        G, _, _ = _precnet_bisection(dist_list, var_span, ld, ld_lims, iqr,
                                     thr_lims, tol, verbose, pbar)
    return G


def _precnet_check_limits(dist_list, var_span, ld, iqr,
                          thr_lims=(1E-3, 5E-1), verbose=False, pbar=False):
    """Checks if desired link density is within given threshold limits."""
    _printmsg("Checking if threshold limits trap target density...",
              verbose)
    ld_lims = []
    for thr in thr_lims:
        e = thr * iqr
        _printmsg("\tTHRESHOLD = %.2E OF INTER-QUARTILE RANGE"
                  % thr, verbose)
        G = _precnet_igraph(dist_list, var_span, e, verbose, pbar)
        ld_curr = link_density(G)
        _printmsg("\tESTIMATED LINK DENSITY = %.4f" % ld_curr, verbose)
        ld_lims.append(ld_curr)
    cond = (ld >= ld_lims[0]) * (ld <= ld_lims[1])
    if not cond:
        str0 = "Target link density could not be bracketed!"
        str1 = "Increase initial THR bracket or change target LD."
135
        str2 = "LD = %.3f for THR = %.2E and %.3f for THR = %.2f" \
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
               % (ld_lims[0], thr_lims[0], ld_lims[1], thr_lims[1])
        print(str0 + "\n" + str1 + "\n" + str2)
    return cond, ld_lims


def _precnet_bisection(dist_list, var_span, ld, ld_lims, iqr,
                       thr_lims=(1E-3, 5E-1), tol=5E-4,
                       verbose=False, pbar=False):
    """Bisection routine to get network of desired link density."""
    _printmsg("Entering bisection routine...", verbose)
    ld_lo, ld_hi = ld_lims[0], ld_lims[1]
    thr_lo, thr_hi = thr_lims[0], thr_lims[1]
    err = np.abs(ld_hi - ld_lo)
    cond = err > tol
    while cond:
        thr_curr = 0.5 * (thr_lo + thr_hi)
        _printmsg("Current values...", verbose)
        _printmsg("\t...Thresholds = %.5f" % thr_curr, verbose)
        _printmsg("\t...Thresholds (Lo, Hi) = (%.5f, %.5f)"
                  % (thr_lo, thr_hi), verbose)
        _printmsg("\t...LDs (Lo, Hi) = (%.5f, %.5f)"
                  % (ld_lo, ld_hi), verbose)
        _printmsg("\t...Error = %G" % err, verbose)
        e = thr_curr * iqr
        G = _precnet_igraph(dist_list, var_span, e, verbose, pbar)
        ld_curr = link_density(G)
        if ld_curr <= ld:
            thr_lo = thr_curr
            ld_lo = link_density(G)
        else:
            thr_hi = thr_curr
            ld_hi = link_density(G)
        err = np.abs(ld_hi - ld_lo)
        cond = err > tol
    return G, thr_curr, ld_curr


def _precnet_igraph(dist_list, var_span, e, verbose=False, pbar=False):
    """
    Returns weighted igraph object by estimating prob. of rec. mat.
    """
    P = prob_recurrence_matrix(dist_list, var_span, e, verbose, pbar)
178
    np.fill_diagonal(P, 0.)  # remove self-loops
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
    G = _precmat_to_igraph(P)
    return G


def _precmat_to_igraph(P):
    """
    Returns igraph weighted graph for given prob. of. rec. matrix.
    """
    G = ig.Graph.Weighted_Adjacency(P.tolist(), mode=ig.ADJ_UNDIRECTED)
    return G


def link_density(G):
    """
    Returns the link density of an igraph Graph object.

    Since Python igraph does not return the correct link density for a
    weighted graph taking into account the edge weights, this wrapper for
    the igraph.Graph.density function is defined. It uses the
    igraph.Graph.density function in case the graph is unweighted but if
    not, it estimates the link density as the sum over all edge weights
    divided by ``n * (n - 1) / 2``where ``n`` is the number of nodes in the
    graph.

    Parameters
    ----------
    G : igraph Graph
        An igraph Graph object that can be weighted or unweighted, but is
        not directed, for which the link density is to be estimated.

    Returns
    -------
    rho : scalar, float
        The estimated link density for the un/weighted undirected graph G.

    See Also
    --------
    igraph.Graph.density

    """
    if G.is_weighted():
        n = G.vcount()
        sum_e = np.sum(G.es["weight"])
        rho = 2. * sum_e / float(n * (n - 1))
    else:
        rho = G.density()
    return rho


def prob_recurrence_matrix(dist_list, var_span, e=0.1,
                           verbose=False, pbar=False):
    """
    Return probability of recurrence matrix for given recurrence threshold.

    The estimated probability of recurrence matrix contains the a priori
    probabilities of recurrence between each pair of time points with the
    help of upper and lower bounds on the difference distributions of those
    two time points. Note that `time` here refers to the time-ordering of
    the distribution list ``dist_list``.

    Parameters
    ----------
    dist_list : list
        List of time-ordered cumulative probability distributions. Each item
        of this list is of the same size as ``var_span`` and has float
        entries corresponding to the cumulative probability evaluated at the
        corresponding value in ``var_span``.
    var_span : numpy 1D array
        Contains the range of proxy values on which the cumulative
        probabilities provided in each item of ``dist_list`` are evaluated.
        For best results, it is advised to ensure that ``var_span`` should
        cover a range of values wide enough so that the first and last
        entries of all the CDFs are 0 and 1 respectively.
    e : number, float
        Recurrence threshold for which to estimate probability of
        recurrence. This is expressed as a fraction of the inter-quartile
        range (see ``iqr``) and is thus a float between 0 and 1.
    iqr : number, float
        Inter-quartile range of the total probability distribution function
        (PDF) of the variable when summed over all time instances.
    verbose: bool, optional
        Specifies verbosity of the routine.
    pbar: bool, optional
        Specifies the display of a progress bar in terminal on execution.

    Returns
    -------
    P_rec : numpy 2D array
        The probability of recurrence matrix containing float values falling
        in the interval [0, 1]. The shape of this array is ``(n, n)`` where
        ``n``is the length of the dataset, i.e., ``n = len(distlist)``.

    See Also
    --------
    prob_recurrence_network

    Notes
    -----
    The probability of recurrence for each pair of time instances is
    estimated by first estimating the upper and lower bounds on the
    difference distribution (cf. Williamson & Down, 1990) and thereafter
    using those bounds to derive upper and lower bounds on the probability
    of recurrence itself. The expected probability of recurrence is then
    proved to be the mean of the bounds obtained thus (cf. Goswami (2015)).

    References
    ----------
    .. [1] Goswami, Bedartha. "Uncertainties in climate data analysis".
    Chapter 6. PhD Dissertation (2015).
    https://publishup.uni-potsdam.de/frontdoor/index/index/docId/7831
    .. [2] Williamson, Robert C., Downs, Tom. "Probabilistic arithmetic I.
    Numerical methods for calculating convolutions and dependency bounds".
    Int. J. Approx. Reasoning (1990). 4:89-158.

    """
    u = var_span
    n = len(dist_list)
    P_rec = np.zeros((n, n))
    _printmsg("\tPairwise recurrence probability bounds...", verbose)
    f1, f2 = np.zeros((n, len(u))), np.zeros((n, len(u)))
    for j in range(n):
300
301
        f1[j, :] = np.interp(u - e, u, dist_list[j])  # for z = e
        f2[j, :] = np.interp(u + e, u, dist_list[j])  # for z = -e
302
303
        prog_bar = _progressbar_start(n, pbar)
    for i in range(n):
304
        fi = np.interp(u, u, dist_list[i])  # Xi.cdf(u)
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
        plus_lo, plus_hi = _bounds_williamson(fi, f1)
        mnus_lo, mnus_hi = _bounds_williamson(fi, f2)
        diff = plus_lo - mnus_hi
        Pij_l = np.fmax(diff, np.zeros(diff.shape))
        diff = plus_hi - mnus_lo
        Pij_u = np.fmin(diff, np.ones(diff.shape))
        P_rec[i, :] = (Pij_l + Pij_u) / 2.
        P_rec[i, i] = 1.
        _progressbar_update(prog_bar, i)
    _progressbar_finish(prog_bar)
    return P_rec


def _bounds_williamson(fi, g):
    """
    Returns bounds on the probability of diff. distribution.
    """
    F = fi - g
    m, M = np.max(F, axis=1), 1. + np.min(F, axis=1)
    m[m < 0.] = 0.
    M[M > 1.] = 1.
    return m, M