Module mici.adapters
Methods for adaptively setting algorithmic parameters of transitions.
"""Methods for adaptively setting algorithmic parameters of transitions."""
from abc import ABC, abstractmethod, abstractproperty
from math import exp, log
import numpy as np
from mici.errors import IntegratorError, AdaptationError
from mici.matrices import (
PositiveDiagonalMatrix, DensePositiveDefiniteMatrix)
class Adapter(ABC):
"""Abstract adapter for implementing schemes to adapt transition parameters.
Adaptation schemes are assumed to be based on updating a collection of
adaptation variables (collectively termed the adapter state here) after
each chain transition based on the sampled chain state and/or statistics of
the transition such as an acceptance probability statistic. After completing
a chain of one or more adaptive transitions, the final adapter state may be
used to perform a final update to the transition parameters.
"""
@abstractmethod
def initialize(self, chain_state, transition):
"""Initialize adapter state prior to starting adaptive transitions.
Args:
chain_state (mici.states.ChainState): Initial chain state adaptive
transition will be started from. May be used to calculate
initial adapter state but should not be mutated by method.
transition (mici.transitions.Transition): Markov transition being
adapted. Attributes of the transition or child objects may be
updated in-place by the method.
Returns:
adapt_state (Dict[str, Any]): Initial adapter state.
"""
@abstractmethod
def update(self, adapt_state, chain_state, trans_stats, transition):
"""Update adapter state after sampling transition being adapted.
Args:
adapt_state (Dict[str, Any]): Current adapter state. Entries will
be updated in-place by the method.
chain_state (mici.states.ChainState): Current chain state following
sampling from transition being adapted. May be used to calculate
adapter state updates but should not be mutated by method.
trans_stats (Dict[str, numeric]): Dictionary of statistics
associated with transition being adapted. May be used to
calculate adapter state updates but should not be mutated by
method.
transition (mici.transitions.Transition): Markov transition being
adapted. Attributes of the transition or child objects may be
updated in-place by the method.
"""
@abstractmethod
def finalize(self, adapt_state, transition):
"""Update transition parameters based on final adapter state or states.
Optionally, if multiple adapter states are available, e.g. from a set of
independent adaptive chains, then the adaptation information from all
the chains may be combined to set the transition parameter(s).
Args:
adapt_state (Dict[str, Any] or List[Dict[str, Any]]): Final adapter
state or a list of adapter states. Arrays / buffers associated
with the adapter state entries may be recycled to reduce memory
usage - if so the corresponding entries will be removed from
the adapter state dictionary / dictionaries.
transition (mici.transitions.Transition): Markov transition being
adapted. Attributes of the transition or child objects will be
updated in-place by the method.
"""
class DualAveragingStepSizeAdapter(Adapter):
"""Dual averaging integrator step size adapter.
Implementation of the dual averaging step size adaptation algorithm
described in [1], a modified version of the stochastic optimisation scheme
of [2]. By default the adaptation is performed to control the `accept_stat`
statistic of an integration transition to be close to a target value but
the statistic adapted on can be altered by changing the `adapt_stat_func`.
References:
1. Hoffman, M.D. and Gelman, A., 2014. The No-U-turn sampler:
adaptively setting path lengths in Hamiltonian Monte Carlo.
Journal of Machine Learning Research, 15(1), pp.1593-1623.
2. Nesterov, Y., 2009. Primal-dual subgradient methods for convex
problems. Mathematical programming 120(1), pp.221-259.
"""
def __init__(self, adapt_stat_target=0.8, adapt_stat_func=None,
log_step_size_reg_target=None,
log_step_size_reg_coefficient=0.05, iter_decay_coeff=0.75,
iter_offset=10, max_init_step_size_iters=100):
"""
Args:
adapt_stat_target (float): Target value for the transition statistic
being controlled during adaptation.
adapt_stat_func (Callable[[Dict[str, numeric]], numeric]): Function
which given a dictionary of transition statistics outputs the
value of the statistic to control during adaptation. By default
this is set to a function which simply selects the 'accept_stat'
value in the statistics dictionary.
log_step_size_reg_target (float or None): Value to regularize the
controlled output (logarithm of the integrator step size)
towards. If `None` set to `log(10 * init_step_size)` where
`init_step_size` is the initial 'reasonable' step size found by
a coarse search as recommended in Hoffman and Gelman (2014).
This has the effect of giving the dual averaging algorithm a
tendency towards testing step sizes larger than the initial
value, as integrating with a larger step size typically has a
lower computational cost.
log_step_size_reg_coefficient (float): Coefficient controlling
amount of regularisation of controlled output (logarithm of the
integrator step size) towards `log_step_size_reg_target`.
Defaults to 0.05 as recommended in Hoffman and Gelman (2014).
iter_decay_coeff (float): Coefficient controlling exponent of
decay in schedule weighting stochastic updates to smoothed log
step size estimate. Should be in the interval (0.5, 1] to ensure
asymptotic convergence of adaptation. A value of 1 gives equal
weight to the whole history of updates while setting to a
smaller value increasingly highly weights recent updates, giving
a tendency to 'forget' early updates. Defaults to 0.75 as
recommended in Hoffman and Gelman (2014).
iter_offset (int): Offset used for the iteration based weighting of
the adaptation statistic error estimate. Should be set to a
non-negative value. A value > 0 has the effect of stabilising
early iterations. Defaults to the value of 10 as recommended in
Hoffman and Gelman (2014).
max_init_step_size_iters (int): Maximum number of iterations to use
in initial search for a reasonable step size with an
`AdaptationError` exception raised if a suitable step size is
not found within this many iterations.
"""
self.adapt_stat_target = adapt_stat_target
if adapt_stat_func is None:
def adapt_stat_func(stats): return stats['accept_stat']
self.adapt_stat_func = adapt_stat_func
self.log_step_size_reg_target = log_step_size_reg_target
self.log_step_size_reg_coefficient = log_step_size_reg_coefficient
self.iter_decay_coeff = iter_decay_coeff
self.iter_offset = iter_offset
self.max_init_step_size_iters = max_init_step_size_iters
def initialize(self, chain_state, transition):
integrator = transition.integrator
system = transition.system
adapt_state = {
'iter': 0,
'smoothed_log_step_size': 0.,
'adapt_stat_error': 0.,
}
init_step_size = (self._find_and_set_init_step_size(
chain_state, system, integrator) if integrator.step_size is None
else integrator.step_size)
if self.log_step_size_reg_target is None:
adapt_state['log_step_size_reg_target'] = log(10 * init_step_size)
else:
adapt_state['log_step_size_reg_target'] = (
self.log_step_size_reg_target)
return adapt_state
def _find_and_set_init_step_size(self, state, system, integrator):
"""Find initial step size by coarse search using single step statistics.
Adaptation of Algorithm 4 in Hoffman and Gelman (2014).
Compared to the Hoffman and Gelman algorithm, this version makes two
changes:
1. The absolute value of the change in Hamiltonian over a step being
larger or smaller than log(2) is used to determine whether the step
size is too big or small as opposed to the value of the equivalent
Metropolis accept probability being larger or smaller than 0.5.
Although a negative change in the Hamiltonian over a step of
magnitude more than log(2) will lead to an accept probability of 1
for the forward move, the corresponding reversed move will have an
accept probability less than 0.5, and so a change in the
Hamiltonian over a step of magnitude more than log(2) irrespective
of the sign of the change is indicative of the minimum acceptance
probability over both forward and reversed steps being less than
0.5.
2. To allow for integrators for which an integrator step may fail due
to e.g. a convergence error in an iterative solver, the step size
is also considered to be too big if any of the step sizes tried in
the search result in a failed integrator step, with in this case
the step size always being decreased on subsequent steps
irrespective of the initial Hamiltonian error, until an integrator
step successfully completes and the absolute value of the change in
Hamiltonian is below the threshold of log(2) (corresponding to a
minimum acceptance probability over forward and reversed steps of
0.5).
"""
init_state = state.copy()
h_init = system.h(init_state)
integrator.step_size = 1
delta_h_threshold = log(2)
for s in range(self.max_init_step_size_iters):
try:
state = integrator.step(init_state)
delta_h = abs(h_init - system.h(state))
if s == 0:
step_size_too_big = delta_h > delta_h_threshold
if (step_size_too_big and delta_h <= delta_h_threshold) or (
not step_size_too_big and delta_h > delta_h_threshold):
return integrator.step_size
elif step_size_too_big:
integrator.step_size /= 2
else:
integrator.step_size *= 2
except IntegratorError:
step_size_too_big = True
integrator.step_size /= 2
raise AdaptationError(
f'Could not find reasonable initial step size in '
f'{self.max_init_step_size_iters} iterations (final step size '
f'{integrator.step_size}). A very large final step size may '
f'indicate that the target distribution is improper such that the '
f'negative log density is flat in one or more directions while a '
f'very small final step size may indicate that the density function'
f' is insufficiently smooth at the initialization point.')
def update(self, adapt_state, chain_state, trans_stats, transition):
adapt_state['iter'] += 1
error_weight = 1 / (self.iter_offset + adapt_state['iter'])
adapt_state['adapt_stat_error'] *= (1 - error_weight)
adapt_state['adapt_stat_error'] += error_weight * (
self.adapt_stat_target - self.adapt_stat_func(trans_stats))
smoothing_weight = (1 / adapt_state['iter'])**self.iter_decay_coeff
log_step_size = adapt_state['log_step_size_reg_target'] - (
adapt_state['adapt_stat_error'] * adapt_state['iter']**0.5 /
self.log_step_size_reg_coefficient)
adapt_state['smoothed_log_step_size'] *= (1 - smoothing_weight)
adapt_state['smoothed_log_step_size'] += (
smoothing_weight * log_step_size)
transition.integrator.step_size = exp(log_step_size)
def finalize(self, adapt_state, transition):
if isinstance(adapt_state, dict):
transition.integrator.step_size = exp(
adapt_state['smoothed_log_step_size'])
else:
transition.integrator.step_size = sum(
exp(a['smoothed_log_step_size'])
for a in adapt_state) / len(adapt_state)
class OnlineVarianceMetricAdapter(Adapter):
"""Diagonal metric adapter using online variance estimates.
Uses Welford's algorithm [1] to stably compute an online estimate of the
sample variances of the chain state position components during sampling. If
online estimates are available from multiple independent chains, the final
variance estimate is calculated from the per-chain statistics using the
parallel / batched incremental variance algorithm described by Chan et al.
[2]. The variance estimates are optionally regularized towards a common
scalar value, with increasing weight for small numbers of samples, to
decrease the effect of noisy estimates for small sample sizes, following the
approach in Stan [3]. The metric matrix representation is set to a diagonal
matrix with diagonal elements corresponding to the reciprocal of the
(regularized) variance estimates.
References:
1. Welford, B. P., 1962. Note on a method for calculating corrected sums
of squares and products. Technometrics, 4(3), pp. 419–420.
2. Chan, T. F., Golub, G. H., LeVeque, R. J., 1979. Updating formulae and
a pairwise algorithm for computing sample variances. Technical Report
STAN-CS-79-773, Department of Computer Science, Stanford University.
3. Carpenter, B., Gelman, A., Hoffman, M.D., Lee, D., Goodrich, B.,
Betancourt, M., Brubaker, M., Guo, J., Li, P. and Riddell, A., 2017.
Stan: A probabilistic programming language. Journal of Statistical
Software, 76(1).
"""
def __init__(self, reg_iter_offset=5, reg_scale=1e-3):
"""
Args:
reg_iter_offset (int): Iteration offset used for calculating
iteration dependent weighting between regularisation target and
current covariance estimate. Higher values cause stronger
regularisation during initial iterations. A value of zero
corresponds to no regularisation; this should only be used if
the sample covariance is guaranteed to be positive definite.
reg_scale (float): Positive scalar defining the value variance estimates
are regularized towards.
"""
self.reg_iter_offset = reg_iter_offset
self.reg_scale = reg_scale
def initialize(self, chain_state, transition):
return {
'iter': 0,
'mean': np.zeros_like(chain_state.pos),
'sum_diff_sq': np.zeros_like(chain_state.pos)
}
def update(self, adapt_state, chain_state, trans_stats, transition):
# Use Welford (1962) incremental algorithm to update statistics to
# calculate online variance estimate
# https://en.wikipedia.org/wiki/
# Algorithms_for_calculating_variance#Welford's_online_algorithm
adapt_state['iter'] += 1
pos_minus_mean = chain_state.pos - adapt_state['mean']
adapt_state['mean'] += pos_minus_mean / adapt_state['iter']
adapt_state['sum_diff_sq'] += pos_minus_mean * (
chain_state.pos - adapt_state['mean'])
def _regularize_var_est(self, var_est, n_iter):
"""Update variance estimates by regularizing towards common scalar.
Performed in place to prevent further array allocations.
"""
if self.reg_iter_offset is not None and self.reg_iter_offset != 0:
var_est *= n_iter / (self.reg_iter_offset + n_iter)
var_est += self.reg_scale * (
self.reg_iter_offset / (self.reg_iter_offset + n_iter))
def finalize(self, adapt_state, transition):
if isinstance(adapt_state, dict):
n_iter = adapt_state['iter']
var_est = adapt_state.pop('sum_diff_sq')
else:
# Use Chan et al. (1979) parallel variance estimation algorithm
# to combine per-chain statistics
# https://en.wikipedia.org/wiki/
# Algorithms_for_calculating_variance#Parallel_algorithm
for i, a in enumerate(adapt_state):
if i == 0:
n_iter = a['iter']
mean_est = a.pop('mean')
var_est = a.pop('sum_diff_sq')
else:
n_iter_prev = n_iter
n_iter += a['iter']
mean_diff = mean_est - a['mean']
mean_est *= n_iter_prev
mean_est += a['iter'] * a['mean']
mean_est /= n_iter
var_est += a['sum_diff_sq']
var_est += mean_diff**2 * (a['iter'] * n_iter_prev) / n_iter
if n_iter < 2:
raise AdaptationError(
'At least two chain samples required to compute a variance '
'estimate.')
var_est /= (n_iter - 1)
self._regularize_var_est(var_est, n_iter)
transition.system.metric = PositiveDiagonalMatrix(var_est).inv
class OnlineCovarianceMetricAdapter(Adapter):
"""Dense metric adapter using online covariance estimates.
Uses Welford's algorithm [1] to stably compute an online estimate of the
sample covariance matrix of the chain state position components during
sampling. If online estimates are available from multiple independent
chains, the final covariance matrix estimate is calculated from the
per-chain statistics using a covariance variant due to Schubert and Gertz
[2] of the parallel / batched incremental variance algorithm described by
Chan et al. [3]. The covariance matrix estimates are optionally regularized
towards a scaled identity matrix, with increasing weight for small numbers of
samples, to decrease the effect of noisy estimates for small sample sizes,
following the approach in Stan [4]. The metric matrix representation is set
to a dense positive definite matrix corresponding to the inverse of the
(regularized) covariance matrix estimate.
References:
1. Welford, B. P., 1962. Note on a method for calculating corrected sums
of squares and products. Technometrics, 4(3), pp. 419–420.
2. Schubert, E. and Gertz, M., 2018. Numerically stable parallel
computation of (co-)variance. ACM. p. 10. doi:10.1145/3221269.3223036.
3. Chan, T. F., Golub, G. H., LeVeque, R. J., 1979. Updating formulae and
a pairwise algorithm for computing sample variances. Technical Report
STAN-CS-79-773, Department of Computer Science, Stanford University.
4. Carpenter, B., Gelman, A., Hoffman, M.D., Lee, D., Goodrich, B.,
Betancourt, M., Brubaker, M., Guo, J., Li, P. and Riddell, A., 2017.
Stan: A probabilistic programming language. Journal of Statistical
Software, 76(1).
"""
def __init__(self, reg_iter_offset=5, reg_scale=1e-3):
"""
Args:
reg_iter_offset (int): Iteration offset used for calculating
iteration dependent weighting between regularisation target and
current covariance estimate. Higher values cause stronger
regularisation during initial iterations.
reg_scale (float): Positive scalar defining the value variance estimates
are regularized towards.
"""
self.reg_iter_offset = reg_iter_offset
self.reg_scale = reg_scale
def initialize(self, chain_state, transition):
dim_pos = chain_state.pos.shape[0]
dtype = chain_state.pos.dtype
return {
'iter': 0,
'mean': np.zeros(shape=(dim_pos,), dtype=dtype),
'sum_diff_outer': np.zeros(shape=(dim_pos, dim_pos), dtype=dtype)
}
def update(self, adapt_state, chain_state, trans_stats, transition):
# Use Welford (1962) incremental algorithm to update statistics to
# calculate online covariance estimate
# https://en.wikipedia.org/wiki/
# Algorithms_for_calculating_variance#Online
adapt_state['iter'] += 1
pos_minus_mean = chain_state.pos - adapt_state['mean']
adapt_state['mean'] += pos_minus_mean / adapt_state['iter']
adapt_state['sum_diff_outer'] += pos_minus_mean[None, :] * (
chain_state.pos - adapt_state['mean'])[:, None]
def _regularize_covar_est(self, covar_est, n_iter):
"""Update covariance estimate by regularising towards identity.
Performed in place to prevent further array allocations.
"""
covar_est *= (n_iter / (self.reg_iter_offset + n_iter))
covar_est_diagonal = np.einsum('ii->i', covar_est)
covar_est_diagonal += self.reg_scale * (
self.reg_iter_offset / (self.reg_iter_offset + n_iter))
def finalize(self, adapt_state, transition):
if isinstance(adapt_state, dict):
n_iter = adapt_state['iter']
covar_est = adapt_state.pop('sum_diff_outer')
else:
# Use Schubert and Gertz (2018) parallel covariance estimation
# algorithm to combine per-chain statistics
for i, a in enumerate(adapt_state):
if i == 0:
n_iter = a['iter']
mean_est = a.pop('mean')
covar_est = a.pop('sum_diff_outer')
else:
n_iter_prev = n_iter
n_iter += a['iter']
mean_diff = mean_est - a['mean']
mean_est *= n_iter_prev
mean_est += a['iter'] * a['mean']
mean_est /= n_iter
covar_est += a['sum_diff_outer']
covar_est += np.outer(mean_diff, mean_diff) * (
a['iter'] * n_iter_prev) / n_iter
if n_iter < 2:
raise AdaptationError(
'At least two chain samples required to compute a variance '
'estimate.')
covar_est /= (n_iter - 1)
self._regularize_covar_est(covar_est, n_iter)
transition.system.metric = DensePositiveDefiniteMatrix(covar_est).inv
Classes
class Adapter (*args, **kwargs)
Abstract adapter for implementing schemes to adapt transition parameters.
Adaptation schemes are assumed to be based on updating a collection of adaptation variables (collectively termed the adapter state here) after each chain transition based on the sampled chain state and/or statistics of the transition such as an acceptance probability statistic. After completing a chain of one or more adaptive transitions, the final adapter state may be used to perform a final update to the transition parameters.
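To make the interface concrete, the sketch below shows a minimal custom adapter that tracks the running mean of the 'accept_stat' transition statistic. It is illustrative only: the mean_accept_stat attribute it sets on the transition in finalize is a hypothetical name chosen for this example, not part of the mici API.

from mici.adapters import Adapter

class AverageAcceptStatAdapter(Adapter):
    """Toy adapter recording the running mean of the 'accept_stat' statistic."""

    def initialize(self, chain_state, transition):
        # Adapter state is a plain dictionary of adaptation variables
        return {'iter': 0, 'sum_accept_stat': 0.}

    def update(self, adapt_state, chain_state, trans_stats, transition):
        # Called after every transition; entries are updated in-place
        adapt_state['iter'] += 1
        adapt_state['sum_accept_stat'] += trans_stats['accept_stat']

    def finalize(self, adapt_state, transition):
        # adapt_state may be a single dict or a list of per-chain dicts
        states = adapt_state if isinstance(adapt_state, list) else [adapt_state]
        total = sum(a['sum_accept_stat'] for a in states)
        count = sum(a['iter'] for a in states)
        # Hypothetical attribute, used purely for illustration
        transition.mean_accept_stat = total / max(count, 1)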
Ancestors
- abc.ABC
Subclasses
- DualAveragingStepSizeAdapter
- OnlineVarianceMetricAdapter
- OnlineCovarianceMetricAdapter
Methods
def initialize(self, chain_state, transition)
Initialize adapter state prior to starting adaptive transitions.
Args
chain_state : ChainState
- Initial chain state adaptive transition will be started from. May be used to calculate initial adapter state but should not be mutated by method.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects may be updated in-place by the method.
Returns
adapt_state : Dict[str, Any]
- Initial adapter state.
def update(self, adapt_state, chain_state, trans_stats, transition)
Update adapter state after sampling transition being adapted.
Args
adapt_state : Dict[str, Any]
- Current adapter state. Entries will be updated in-place by the method.
chain_state : ChainState
- Current chain state following sampling from transition being adapted. May be used to calculate adapter state updates but should not be mutated by method.
trans_stats : Dict[str, numeric]
- Dictionary of statistics associated with transition being adapted. May be used to calculate adapter state updates but should not be mutated by method.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects may be updated in-place by the method.
def finalize(self, adapt_state, transition)
Update transition parameters based on final adapter state or states.
Optionally, if multiple adapter states are available, e.g. from a set of independent adaptive chains, the adaptation information from all the chains may be combined to set the transition parameter(s).
Args
adapt_state : Dict[str, Any] or List[Dict[str, Any]]
- Final adapter state or a list of adapter states. Arrays / buffers associated with the adapter state entries may be recycled to reduce memory usage - if so the corresponding entries will be removed from the adapter state dictionary / dictionaries.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects will be updated in-place by the method.
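The three methods are intended to be called in a fixed order during an adaptive warm-up phase. A rough sketch of such a driving loop is given below; the transition.sample(state, rng) call and the surrounding loop are stand-ins for whatever sampler owns the adapter and are not meant to reproduce mici's internal chain machinery.

def run_adaptive_warm_up(adapter, transition, chain_state, rng, n_warm_up_iter):
    # 1. Compute initial adapter state from the initial chain state
    adapt_state = adapter.initialize(chain_state, transition)
    for _ in range(n_warm_up_iter):
        # 2. Sample the next chain state and associated transition statistics
        #    (assumed sample signature, for the purposes of this sketch only)
        chain_state, trans_stats = transition.sample(chain_state, rng)
        # 3. Update adapter state, possibly mutating transition parameters
        adapter.update(adapt_state, chain_state, trans_stats, transition)
    # 4. Final update of transition parameters; a list of per-chain adapter
    #    states may be passed instead when combining multiple chains
    adapter.finalize(adapt_state, transition)
    return chain_state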
class DualAveragingStepSizeAdapter (adapt_stat_target=0.8, adapt_stat_func=None, log_step_size_reg_target=None, log_step_size_reg_coefficient=0.05, iter_decay_coeff=0.75, iter_offset=10, max_init_step_size_iters=100)
Dual averaging integrator step size adapter.
Implementation of the dual averaging step size adaptation algorithm described in [1], a modified version of the stochastic optimisation scheme of [2]. By default the adaptation is performed to control the accept_stat statistic of an integration transition to be close to a target value, but the statistic adapted on can be altered by changing the adapt_stat_func.
References
- Hoffman, M.D. and Gelman, A., 2014. The No-U-turn sampler: adaptively setting path lengths in Hamiltonian Monte Carlo. Journal of Machine Learning Research, 15(1), pp.1593-1623.
- Nesterov, Y., 2009. Primal-dual subgradient methods for convex problems. Mathematical Programming, 120(1), pp.221-259.
Args
adapt_stat_target : float
- Target value for the transition statistic being controlled during adaptation.
adapt_stat_func : Callable[[Dict[str, numeric]], numeric]
- Function which given a dictionary of transition statistics outputs the value of the statistic to control during adaptation. By default this is set to a function which simply selects the 'accept_stat' value in the statistics dictionary.
log_step_size_reg_target : float or None
- Value to regularize the controlled output (logarithm of the integrator step size) towards. If None, set to log(10 * init_step_size) where init_step_size is the initial 'reasonable' step size found by a coarse search as recommended in Hoffman and Gelman (2014). This has the effect of giving the dual averaging algorithm a tendency towards testing step sizes larger than the initial value, as integrating with a larger step size typically has a lower computational cost.
log_step_size_reg_coefficient : float
- Coefficient controlling amount of regularisation of controlled output (logarithm of the integrator step size) towards log_step_size_reg_target. Defaults to 0.05 as recommended in Hoffman and Gelman (2014).
iter_decay_coeff : float
- Coefficient controlling exponent of decay in schedule weighting stochastic updates to smoothed log step size estimate. Should be in the interval (0.5, 1] to ensure asymptotic convergence of adaptation. A value of 1 gives equal weight to the whole history of updates while setting to a smaller value increasingly highly weights recent updates, giving a tendency to 'forget' early updates. Defaults to 0.75 as recommended in Hoffman and Gelman (2014).
iter_offset : int
- Offset used for the iteration based weighting of the adaptation statistic error estimate. Should be set to a non-negative value. A value > 0 has the effect of stabilising early iterations. Defaults to the value of 10 as recommended in Hoffman and Gelman (2014).
max_init_step_size_iters : int
- Maximum number of iterations to use in initial search for a reasonable step size, with an AdaptationError exception raised if a suitable step size is not found within this many iterations.
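For example, an adapter with the default settings, and one targeting a higher acceptance statistic with an explicit (here equivalent to the default) statistic function, might be constructed as follows; passing the adapter to a sampler is not shown, as that depends on the sampler API.

from mici.adapters import DualAveragingStepSizeAdapter

# Default: adapt step size so the 'accept_stat' statistic approaches 0.8
step_size_adapter = DualAveragingStepSizeAdapter()

# More conservative variant with an explicit statistic selector
conservative_adapter = DualAveragingStepSizeAdapter(
    adapt_stat_target=0.9,
    adapt_stat_func=lambda stats: stats['accept_stat'],
    iter_decay_coeff=0.75,  # default weighting of update history
)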
Ancestors
- Adapter
- abc.ABC
Methods
def initialize(self, chain_state, transition)
Initialize adapter state prior to starting adaptive transitions.
Args
chain_state : ChainState
- Initial chain state adaptive transition will be started from. May be used to calculate initial adapter state but should not be mutated by method.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects may be updated in-place by the method.
Returns
adapt_state : Dict[str, Any]
- Initial adapter state.
def update(self, adapt_state, chain_state, trans_stats, transition)
Update adapter state after sampling transition being adapted.
Args
adapt_state : Dict[str, Any]
- Current adapter state. Entries will be updated in-place by the method.
chain_state : ChainState
- Current chain state following sampling from transition being adapted. May be used to calculate adapter state updates but should not be mutated by method.
trans_stats : Dict[str, numeric]
- Dictionary of statistics associated with transition being adapted. May be used to calculate adapter state updates but should not be mutated by method.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects may be updated in-place by the method.
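In the notation of Hoffman and Gelman (2014), with t the adaptation iteration, alpha_t the controlled statistic value, delta = adapt_stat_target, t_0 = iter_offset, gamma = log_step_size_reg_coefficient, kappa = iter_decay_coeff and mu = log_step_size_reg_target, the update implements the dual averaging recursions

\bar{H}_t = \left(1 - \frac{1}{t + t_0}\right) \bar{H}_{t-1} + \frac{1}{t + t_0} \left(\delta - \alpha_t\right),
\qquad
\log \epsilon_t = \mu - \frac{\sqrt{t}}{\gamma} \bar{H}_t,
\qquad
\overline{\log \epsilon}_t = \left(1 - t^{-\kappa}\right) \overline{\log \epsilon}_{t-1} + t^{-\kappa} \log \epsilon_t,

where \bar{H}_t corresponds to adapt_state['adapt_stat_error'], \log \epsilon_t to the step size set on the integrator during warm-up, and \overline{\log \epsilon}_t to adapt_state['smoothed_log_step_size'] used by finalize.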
def finalize(self, adapt_state, transition)
Update transition parameters based on final adapter state or states.
Optionally, if multiple adapter states are available, e.g. from a set of independent adaptive chains, the adaptation information from all the chains may be combined to set the transition parameter(s).
Args
adapt_state : Dict[str, Any] or List[Dict[str, Any]]
- Final adapter state or a list of adapter states. Arrays / buffers associated with the adapter state entries may be recycled to reduce memory usage - if so the corresponding entries will be removed from the adapter state dictionary / dictionaries.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects will be updated in-place by the method.
class OnlineVarianceMetricAdapter (reg_iter_offset=5, reg_scale=0.001)
Diagonal metric adapter using online variance estimates.
Uses Welford's algorithm [1] to stably compute an online estimate of the sample variances of the chain state position components during sampling. If online estimates are available from multiple independent chains, the final variance estimate is calculated from the per-chain statistics using the parallel / batched incremental variance algorithm described by Chan et al. [2]. The variance estimates are optionally regularized towards a common scalar value, with increasing weight for small numbers of samples, to decrease the effect of noisy estimates for small sample sizes, following the approach in Stan [3]. The metric matrix representation is set to a diagonal matrix with diagonal elements corresponding to the reciprocal of the (regularized) variance estimates.
References
- Welford, B. P., 1962. Note on a method for calculating corrected sums of squares and products. Technometrics, 4(3), pp. 419–420.
- Chan, T. F., Golub, G. H., LeVeque, R. J., 1979. Updating formulae and a pairwise algorithm for computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
- Carpenter, B., Gelman, A., Hoffman, M.D., Lee, D., Goodrich, B., Betancourt, M., Brubaker, M., Guo, J., Li, P. and Riddell, A., 2017. Stan: A probabilistic programming language. Journal of Statistical Software, 76(1).
Args
reg_iter_offset : int
- Iteration offset used for calculating iteration dependent weighting between regularisation target and current covariance estimate. Higher values cause stronger regularisation during initial iterations. A value of zero corresponds to no regularisation; this should only be used if the sample covariance is guaranteed to be positive definite.
reg_scale : float
- Positive scalar defining the value variance estimates are regularized towards.
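For reference, the Welford recursion used by this adapter can be reproduced and checked against a batch estimate with plain NumPy; the function below is a standalone sketch, not part of mici.

import numpy as np

def welford_variance(positions):
    # positions: array of shape (n_samples, dim)
    mean = np.zeros(positions.shape[1])
    sum_diff_sq = np.zeros(positions.shape[1])
    for i, pos in enumerate(positions, start=1):
        pos_minus_mean = pos - mean
        mean += pos_minus_mean / i                     # running mean update
        sum_diff_sq += pos_minus_mean * (pos - mean)   # uses old and new means
    return sum_diff_sq / (len(positions) - 1)          # unbiased variance

rng = np.random.default_rng(seed=0)
samples = rng.standard_normal((1000, 3))
assert np.allclose(welford_variance(samples), samples.var(axis=0, ddof=1))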
Ancestors
- Adapter
- abc.ABC
Methods
def initialize(self, chain_state, transition)
Initialize adapter state prior to starting adaptive transitions.
Args
chain_state : ChainState
- Initial chain state adaptive transition will be started from. May be used to calculate initial adapter state but should not be mutated by method.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects may be updated in-place by the method.
Returns
adapt_state : Dict[str, Any]
- Initial adapter state.
def update(self, adapt_state, chain_state, trans_stats, transition)
Update adapter state after sampling transition being adapted.
Args
adapt_state : Dict[str, Any]
- Current adapter state. Entries will be updated in-place by the method.
chain_state : ChainState
- Current chain state following sampling from transition being adapted. May be used to calculate adapter state updates but should not be mutated by method.
trans_stats : Dict[str, numeric]
- Dictionary of statistics associated with transition being adapted. May be used to calculate adapter state updates but should not be mutated by method.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects may be updated in-place by the method.
def finalize(self, adapt_state, transition)
Update transition parameters based on final adapter state or states.
Optionally, if multiple adapter states are available, e.g. from a set of independent adaptive chains, the adaptation information from all the chains may be combined to set the transition parameter(s).
Args
adapt_state : Dict[str, Any] or List[Dict[str, Any]]
- Final adapter state or a list of adapter states. Arrays / buffers associated with the adapter state entries may be recycled to reduce memory usage - if so the corresponding entries will be removed from the adapter state dictionary / dictionaries.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects will be updated in-place by the method.
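The per-chain (count, mean, sum of squared deviations) summaries are merged with the standard pairwise rule from Chan et al. (1979); the standalone sketch below checks that combining two batches reproduces the variance of the pooled samples.

import numpy as np

def combine_summaries(n_a, mean_a, m2_a, n_b, mean_b, m2_b):
    # Pairwise combination of Welford summaries from two independent batches
    n = n_a + n_b
    mean_diff = mean_b - mean_a
    mean = (n_a * mean_a + n_b * mean_b) / n
    m2 = m2_a + m2_b + mean_diff ** 2 * (n_a * n_b) / n
    return n, mean, m2

rng = np.random.default_rng(seed=1)
x = rng.standard_normal((500, 2))
batches = (x[:200], x[200:])
summaries = [(len(b), b.mean(0), ((b - b.mean(0)) ** 2).sum(0)) for b in batches]
n, mean, m2 = combine_summaries(*summaries[0], *summaries[1])
assert np.allclose(m2 / (n - 1), x.var(axis=0, ddof=1))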
class OnlineCovarianceMetricAdapter (reg_iter_offset=5, reg_scale=0.001)
Dense metric adapter using online covariance estimates.
Uses Welford's algorithm [1] to stably compute an online estimate of the sample covariance matrix of the chain state position components during sampling. If online estimates are available from multiple independent chains, the final covariance matrix estimate is calculated from the per-chain statistics using a covariance variant due to Schubert and Gertz [2] of the parallel / batched incremental variance algorithm described by Chan et al. [3]. The covariance matrix estimates are optionally regularized towards a scaled identity matrix, with increasing weight for small numbers of samples, to decrease the effect of noisy estimates for small sample sizes, following the approach in Stan [4]. The metric matrix representation is set to a dense positive definite matrix corresponding to the inverse of the (regularized) covariance matrix estimate.
References
- Welford, B. P., 1962. Note on a method for calculating corrected sums of squares and products. Technometrics, 4(3), pp. 419–420.
- Schubert, E. and Gertz, M., 2018. Numerically stable parallel computation of (co-)variance. ACM. p. 10. doi:10.1145/3221269.3223036.
- Chan, T. F., Golub, G. H., LeVeque, R. J., 1979. Updating formulae and a pairwise algorithm for computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
- Carpenter, B., Gelman, A., Hoffman, M.D., Lee, D., Goodrich, B., Betancourt, M., Brubaker, M., Guo, J., Li, P. and Riddell, A., 2017. Stan: A probabilistic programming language. Journal of Statistical Software, 76(1).
Args
reg_iter_offset : int
- Iteration offset used for calculating iteration dependent weighting between regularisation target and current covariance estimate. Higher values cause stronger regularisation during initial iterations.
reg_scale : float
- Positive scalar defining the value variance estimates are regularized towards.
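The regularisation applied in finalize is a simple shrinkage of the sample covariance towards reg_scale times the identity, with a weight on the target that decays as the number of warm-up samples grows; a standalone sketch of the equivalent (out-of-place) computation, mirroring the in-place _regularize_covar_est method, is shown below.

import numpy as np

def regularized_covariance(covar_est, n_iter, reg_iter_offset=5, reg_scale=1e-3):
    # Shrink towards reg_scale * identity with weight
    # reg_iter_offset / (reg_iter_offset + n_iter) on the target
    weight = reg_iter_offset / (reg_iter_offset + n_iter)
    identity = np.identity(covar_est.shape[0])
    return (1 - weight) * covar_est + weight * reg_scale * identity

The diagonal OnlineVarianceMetricAdapter applies the same shrinkage elementwise to the variance estimates, with the scalar target reg_scale.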
Ancestors
- Adapter
- abc.ABC
Methods
def initialize(self, chain_state, transition)
Initialize adapter state prior to starting adaptive transitions.
Args
chain_state : ChainState
- Initial chain state adaptive transition will be started from. May be used to calculate initial adapter state but should not be mutated by method.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects may be updated in-place by the method.
Returns
adapt_state : Dict[str, Any]
- Initial adapter state.
def update(self, adapt_state, chain_state, trans_stats, transition)
Update adapter state after sampling transition being adapted.
Args
adapt_state : Dict[str, Any]
- Current adapter state. Entries will be updated in-place by the method.
chain_state : ChainState
- Current chain state following sampling from transition being adapted. May be used to calculate adapter state updates but should not be mutated by method.
trans_stats : Dict[str, numeric]
- Dictionary of statistics associated with transition being adapted. May be used to calculate adapter state updates but should not be mutated by method.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects may be updated in-place by the method.
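Each update accumulates the outer product of the deviation from the post-update mean with the deviation from the pre-update mean; the standalone sketch below reproduces the recursion and checks it against a batch covariance estimate.

import numpy as np

def welford_covariance(positions):
    # positions: array of shape (n_samples, dim)
    dim = positions.shape[1]
    mean = np.zeros(dim)
    sum_diff_outer = np.zeros((dim, dim))
    for i, pos in enumerate(positions, start=1):
        pos_minus_mean = pos - mean            # deviation from old mean
        mean += pos_minus_mean / i             # running mean update
        sum_diff_outer += np.outer(pos - mean, pos_minus_mean)
    return sum_diff_outer / (len(positions) - 1)

rng = np.random.default_rng(seed=3)
samples = rng.standard_normal((2000, 3))
assert np.allclose(welford_covariance(samples), np.cov(samples, rowvar=False))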
def finalize(self, adapt_state, transition)
Update transition parameters based on final adapter state or states.
Optionally, if multiple adapter states are available, e.g. from a set of independent adaptive chains, the adaptation information from all the chains may be combined to set the transition parameter(s).
Args
adapt_state : Dict[str, Any] or List[Dict[str, Any]]
- Final adapter state or a list of adapter states. Arrays / buffers associated with the adapter state entries may be recycled to reduce memory usage - if so the corresponding entries will be removed from the adapter state dictionary / dictionaries.
transition : Transition
- Markov transition being adapted. Attributes of the transition or child objects will be updated in-place by the method.