#!/usr/bin/env python
"""
Multi-dimensional non-parametric kernel regression
This module was written by Matthias Cuntz while at Department of
Computational Hydrosystems, Helmholtz Centre for Environmental
Research - UFZ, Leipzig, Germany, and continued while at Institut
National de Recherche pour l'Agriculture, l'Alimentation et
l'Environnement (INRAE), Nancy, France.
:copyright: Copyright 2012-2022 Matthias Cuntz, see AUTHORS.rst for details.
:license: MIT License, see LICENSE for details.
.. moduleauthor:: Matthias Cuntz
The following functions are provided
.. autosummary::
kernel_regression_h
kernel_regression
History
* Written, Jun 2012 by Matthias Cuntz - mc (at) macu (dot) de,
inspired by Matlab routines of Yingying Dong, Boston College and
Yi Cao, Cranfield University
* Assert correct input, Apr 2014, Matthias Cuntz
* Corrected bug in _boot_h: x.size->x.shape[0], Jan 2018, Matthias Cuntz
* Code refactoring, Sep 2021, Matthias Cuntz
* Use format strings, Apr 2022, Matthias Cuntz
* Use minimize with method TNC instead of fmin_tnc,
Apr 2022, Matthias Cuntz
* Use helper function array2input to assure correct output type,
Apr 2022, Matthias Cuntz
* Return scalar h if 1-dimensional, Apr 2022, Matthias Cuntz
* Output type is same as y instead of x or xout, Apr 2022, Matthias Cuntz
"""
import numpy as np
import scipy.optimize as opt
from .division import division
from .helper import array2input
# from division import division
# from helper import array2input
__all__ = ['kernel_regression_h', 'kernel_regression']
def _nadaraya_watson(z, y):
"""
Helper function that calculates the Nadaraya-Watson estimator
for a given kernel.
    Currently, only the Gaussian kernel is implemented.
"""
kerf = 1. / np.sqrt(2. * np.pi) * np.exp(-0.5 * z * z)
w = np.prod(kerf, 1)
out = division(np.dot(w, y), np.sum(w), np.nan)
return out
def _cross_valid_h(h, x, y):
"""
    Helper function that calculates the cross-validation function for the
    Nadaraya-Watson estimator, which is the mean square error with the
    model estimate replaced by the leave-one-out (jackknife) estimate
    (Hardle and Muller 2000).
"""
n = x.shape[0]
# allocate output
out = np.empty(n)
# Loop through each regression point
for i in range(n):
        # leave one out: all points except point i
xx = np.delete(x, i, axis=0)
yy = np.delete(y, i, axis=0)
z = (xx - x[i, :]) / h
out[i] = _nadaraya_watson(z, yy)
cv = np.sum((y - out)**2) / float(n)
return cv
def _boot_h(h, x, y):
"""
    Helper function that calculates a bootstrapped version of the
    cross-validation function for the Nadaraya-Watson estimator, i.e. the
    mean square error with the model estimate replaced by the leave-one-out
    (jackknife) estimate (Hardle and Muller 2000).
    This is essentially _cross_valid_h evaluated at 100 randomly chosen
    points only.
"""
n = 100
ind = np.random.randint(x.shape[0], size=n)
# allocate output
out = np.empty(n)
# Loop through each bootstrap point
for i in range(n):
        # leave one out: all points except the sampled point ind[i]
        xx = np.delete(x, ind[i], axis=0)
        yy = np.delete(y, ind[i], axis=0)
        z = (xx - x[ind[i], :]) / h
out[i] = _nadaraya_watson(z, yy)
cv = np.sum((y[ind] - out)**2) / float(n)
return cv
def kernel_regression_h(x, y, silverman=False):
"""
Optimal bandwidth for multi-dimensional non-parametric kernel regression
Optimal bandwidth is determined using cross-validation or
Silverman's rule-of-thumb.
Parameters
----------
x : array_like (n, k)
Independent values
y : array_like (n)
Dependent values
silverman : bool, optional
Use Silverman's rule-of-thumb to calculate bandwidth *h* if True,
otherwise determine *h* via cross-validation
Returns
-------
    float or array
        Optimal bandwidth *h*. For multi-dimensional regression, *h* is a
        1d-array, assuming a diagonal bandwidth matrix.
References
----------
Hardle W and Muller M (2000) Multivariate and semiparametric kernel
regression. In MG Schimek (Ed.), Smoothing and regression: Approaches,
computation, and application (pp. 357-392). Hoboken, NJ, USA: John Wiley
& Sons, Inc. doi: 10.1002/9781118150658.ch12
Examples
--------
>>> n = 10
>>> x = np.zeros((n, 2))
>>> x[:, 0] = np.arange(n, dtype=float) / float(n-1)
>>> x[:, 1] = 1. / (np.arange(n, dtype=float) / float(n-1) + 0.1)
>>> y = 1. + x[:, 0]**2 - np.sin(x[:, 1])**2
>>> h = kernel_regression_h(x, y)
>>> print(np.allclose(h, [0.172680, 9.516907], atol=0.0001))
True
>>> h = kernel_regression_h(x, y, silverman=True)
>>> print(np.allclose(h, [0.229190, 1.903381], atol=0.0001))
True
>>> n = 10
>>> x = np.arange(n, dtype=float) / float(n-1)
>>> y = 1. + x**2 - np.sin(x)**2
>>> h = kernel_regression_h(x, y)
>>> print(np.around(h, 4))
0.045
>>> h = kernel_regression_h(x, y, silverman=True)
>>> print(np.around(h, 4))
0.2248
"""
xx = np.array(x)
yy = np.array(y)
# Check input
assert xx.shape[0] == yy.size, (
f'size(x, 0) != size(y): {xx.shape[0]} != {yy.size}' )
if xx.ndim == 1: # deal with 1d-arrays
xx = xx[:, np.newaxis]
n = xx.shape[0]
d = xx.shape[1]
# Silverman (1986), Scott (1992), Bowman and Azzalini (1997)
# Very similar to stats.gaussian_kde
# h has dimension d
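    # h_j = (4 / ((d + 2) * n))**(1 / (d + 4)) * std(x_j)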
h = ( (4. / float(d + 2) / float(n))**(1. / float(d + 4)) *
np.std(xx, axis=0, ddof=1) )
if not silverman:
# Find the optimal h
bounds = [(0.2*i, 5.0*i) for i in h]
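        # full leave-one-out cross-validation for small samples,
        # bootstrapped cross-validation at 100 random points otherwise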
if n <= 100:
res = opt.minimize(
_cross_valid_h, h, args=(xx, yy), method='TNC', bounds=bounds,
options={'ftol': 1e-10, 'xtol': 1e-10, 'maxfun': 1000})
h = res.x
else:
res = opt.minimize(
_boot_h, h, args=(xx, yy), method='TNC', bounds=bounds,
options={'ftol': 1e-10, 'xtol': 1e-10, 'maxfun': 1000})
h = res.x
if len(h) == 1:
h = h[0]
return h
def kernel_regression(x, y, h=None, silverman=False, xout=None):
"""
Multi-dimensional non-parametric kernel regression
Optimal bandwidth can be estimated by cross-validation
or by using Silverman's rule-of-thumb.
Parameters
----------
x : array_like (n, k)
Independent values
y : array_like (n)
Dependent values
h : float or array_like(k), optional
Use *h* as bandwidth for calculating regression values if given,
otherwise determine optimal *h* using cross-validation or by using
Silverman's rule-of-thumb if *silverman==True*.
silverman : bool, optional
Use Silverman's rule-of-thumb to calculate bandwidth *h* if True,
otherwise determine *h* via cross-validation.
Only used if *h* is not given.
    xout : array_like (nout, k), optional
Return fitted values at *xout* if given,
otherwise return fitted values at *x*.
Returns
-------
    array_like with same type as *y*
        Fitted values at *x*, or at *xout* if given
References
----------
Hardle W and Muller M (2000) Multivariate and semiparametric kernel
regression. In MG Schimek (Ed.), Smoothing and regression: Approaches,
computation, and application (pp. 357-392). Hoboken, NJ, USA: John Wiley
& Sons, Inc. doi: 10.1002/9781118150658.ch12
Examples
--------
>>> n = 10
>>> x = np.zeros((n, 2))
>>> x[:, 0] = np.arange(n, dtype=float) / float(n-1)
>>> x[:, 1] = 1. / (np.arange(n, dtype=float) / float(n-1) + 0.1)
>>> y = 1. + x[:, 0]**2 - np.sin(x[:, 1])**2
Separate determination of h and kernel regression
>>> h = kernel_regression_h(x, y)
>>> yk = kernel_regression(x, y, h)
>>> print(np.allclose(yk[0:6],
... [0.52241, 0.52570, 0.54180, 0.51781, 0.47644, 0.49230],
... atol=0.0001))
True
Single call to kernel regression
>>> yk = kernel_regression(x, y)
>>> print(np.allclose(yk[0:6],
... [0.52241, 0.52570, 0.54180, 0.51781, 0.47644, 0.49230],
... atol=0.0001))
True
Single call to kernel regression using Silverman's rule-of-thumb for h
>>> yk = kernel_regression(x, y, silverman=True)
>>> print(np.allclose(yk[0:6],
... [0.691153, 0.422809, 0.545844, 0.534315, 0.521494, 0.555426],
... atol=0.0001))
True
>>> n = 5
>>> xx = np.empty((n, 2))
>>> xx[:, 0] = (np.amin(x[:, 0]) + (np.amax(x[:, 0]) - np.amin(x[:, 0])) *
... np.arange(n, dtype=float) / float(n))
>>> xx[:, 1] = (np.amin(x[:, 1]) + (np.amax(x[:, 1]) - np.amin(x[:, 1])) *
... np.arange(n, dtype=float) / float(n))
>>> yk = kernel_regression(x, y, silverman=True, xout=xx)
>>> print(np.allclose(yk,
... [0.605485, 0.555235, 0.509529, 0.491191, 0.553325],
... atol=0.0001))
True
"""
xx = np.array(x)
yy = np.array(y)
# Check input
assert xx.shape[0] == yy.size, (
f'size(x,0) != size(y): {xx.shape[0]} != {yy.size}' )
    # deal with 1d-arrays
if xx.ndim == 1:
xx = xx[:, np.newaxis]
d = xx.shape[1]
# determine h
if h is None:
hh = kernel_regression_h(xx, yy, silverman=silverman)
else:
if np.size(np.shape(h)) == 0:
hh = np.repeat(h, d) # use h for all dimensions if one h given
else:
hh = np.array(h)
assert np.size(hh) == d, 'size(h) must be 1 or x.shape[1]'
# Calc regression
if xout is None:
xxout = xx
else:
xxout = np.array(xout)
if xxout.ndim == 1:
xxout = xxout[:, np.newaxis]
nout = xxout.shape[0]
dout = xxout.shape[1]
assert d == dout, f'size(x, 1) != size(xout, 1): {d} != {dout}'
# allocate output
out = np.empty(nout)
# Loop through each regression point
for i in range(nout):
        # scaled difference from regression point
z = (xx - xxout[i, :]) / hh
        # Nadaraya-Watson estimator with multivariate Gaussian kernel
        out[i] = _nadaraya_watson(z, yy)
out = array2input(out, y, undef=np.nan)
return out
if __name__ == '__main__':
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
# nn = 1000
# x = np.zeros((nn, 2))
# x[:, 0] = np.arange(nn, dtype=float) / float(nn-1)
# x[:, 1] = 1. / (x[:, 0] + 0.1)
# y = 1. + x[:, 0]**2 - np.sin(x[:, 1])**2
# h = kernel_regression_h(x, y)
# print(h)
# yy = kernel_regression(x, y, h, xout=x)
# print(yy[0], yy[-1])
# print(kernel_regression(x, y))
# h = kernel_regression_h(x, y, silverman=True)
# print(h)
# print(kernel_regression(x, y, h))
# ss = np.shape(x)
# nn = 5
# xx = np.empty((nn, ss[1]))
# xx[:,0] = (np.amin(x[:,0]) + (np.amax(x[:,0])-np.amin(x[:,0])) *
# np.arange(nn,dtype=float)/float(nn))
# xx[:,1] = (np.amin(x[:,1]) + (np.amax(x[:,1])-np.amin(x[:,1])) *
# np.arange(nn,dtype=float)/float(nn))
# print(kernel_regression(x, y, h, xout=xx))