advi.hpp (Stan 2.10.0)
#ifndef STAN_VARIATIONAL_ADVI_HPP
#define STAN_VARIATIONAL_ADVI_HPP

#include <stan/math.hpp>
#include <stan/interface_callbacks/writer/base_writer.hpp>
#include <stan/io/dump.hpp>
#include <stan/model/util.hpp>
#include <stan/services/error_codes.hpp>
#include <stan/services/io/write_iteration.hpp>
#include <stan/services/variational/print_progress.hpp>
#include <boost/circular_buffer.hpp>
#include <boost/lexical_cast.hpp>
#include <algorithm>
#include <ctime>
#include <iomanip>
#include <limits>
#include <numeric>
#include <ostream>
#include <vector>
#include <queue>
#include <string>

namespace stan {

  namespace variational {

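    /**
     * Automatic Differentiation Variational Inference (ADVI):
     * "black box" variational inference for a Stan model via
     * stochastic gradient ascent on the ELBO, with Q as the
     * variational family and BaseRNG as the random number generator.
     */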
    template <class Model, class Q, class BaseRNG>
    class advi {
    public:
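      /**
       * Constructor. Checks that the Monte Carlo sample counts, the
       * ELBO evaluation interval, and the number of posterior output
       * samples are all positive.
       */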
      advi(Model& m,
           Eigen::VectorXd& cont_params,
           BaseRNG& rng,
           int n_monte_carlo_grad,
           int n_monte_carlo_elbo,
           int eval_elbo,
           int n_posterior_samples)
        : model_(m),
          cont_params_(cont_params),
          rng_(rng),
          n_monte_carlo_grad_(n_monte_carlo_grad),
          n_monte_carlo_elbo_(n_monte_carlo_elbo),
          eval_elbo_(eval_elbo),
          n_posterior_samples_(n_posterior_samples) {
        static const char* function = "stan::variational::advi";
        math::check_positive(function,
                             "Number of Monte Carlo samples for gradients",
                             n_monte_carlo_grad_);
        math::check_positive(function,
                             "Number of Monte Carlo samples for ELBO",
                             n_monte_carlo_elbo_);
        math::check_positive(function,
                             "Evaluate ELBO at every eval_elbo iteration",
                             eval_elbo_);
        math::check_positive(function,
                             "Number of posterior samples for output",
                             n_posterior_samples_);
      }

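      /**
       * Calculates the Evidence Lower BOund (ELBO) by sampling from
       * the variational distribution and evaluating the log joint
       * density. The Monte Carlo estimate is
       *
       *   ELBO(q) ~= (1/S) * sum_{s=1}^{S} log p(x, zeta_s) + H[q],
       *
       * where the zeta_s are draws from q, S = n_monte_carlo_elbo_,
       * and the entropy H[q] is supplied in closed form by
       * variational.entropy().
       */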
      double calc_ELBO(const Q& variational,
                       interface_callbacks::writer::base_writer& message_writer)
        const {
        static const char* function =
          "stan::variational::advi::calc_ELBO";

        double elbo = 0.0;
        int dim = variational.dimension();
        Eigen::VectorXd zeta(dim);

        int n_dropped_evaluations = 0;
        for (int i = 0; i < n_monte_carlo_elbo_;) {
          variational.sample(rng_, zeta);
          try {
            std::stringstream ss;
            double log_prob = model_.template log_prob<false, true>(zeta, &ss);
            if (ss.str().length() > 0)
              message_writer(ss.str());
            stan::math::check_finite(function, "log_prob", log_prob);
            elbo += log_prob;
            ++i;
          } catch (const std::domain_error& e) {
            ++n_dropped_evaluations;
            if (n_dropped_evaluations >= n_monte_carlo_elbo_) {
              const char* name = "The number of dropped evaluations";
              const char* msg1 = "has reached its maximum amount (";
              const char* msg2 = "). Your model may be either severely "
                "ill-conditioned or misspecified.";
              stan::math::domain_error(function, name, n_monte_carlo_elbo_,
                                       msg1, msg2);
            }
          }
        }
        elbo /= n_monte_carlo_elbo_;
        elbo += variational.entropy();
        return elbo;
      }

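      /**
       * Calculates the "black box" gradient of the ELBO with respect
       * to the variational parameters, estimated from
       * n_monte_carlo_grad_ draws via the reparameterization trick;
       * the gradient computation itself lives in the variational
       * family's calc_grad().
       */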
      void calc_ELBO_grad(const Q& variational, Q& elbo_grad,
                          interface_callbacks::writer::base_writer&
                          message_writer)
        const {
        static const char* function =
          "stan::variational::advi::calc_ELBO_grad";

        stan::math::check_size_match(function,
                                     "Dimension of elbo_grad",
                                     elbo_grad.dimension(),
                                     "Dimension of variational q",
                                     variational.dimension());
        stan::math::check_size_match(function,
                                     "Dimension of variational q",
                                     variational.dimension(),
                                     "Dimension of variables in model",
                                     cont_params_.size());

        variational.calc_grad(elbo_grad,
                              model_, cont_params_, n_monte_carlo_grad_, rng_,
                              message_writer);
      }

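      /**
       * Heuristic grid search to adapt eta to the scale of the
       * problem: each candidate eta in {100, 10, 1, 0.1, 0.01} is run
       * for adapt_iterations steps of stochastic gradient ascent, and
       * the candidate with the best non-divergent ELBO is returned.
       */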
      double adapt_eta(Q& variational,
                       int adapt_iterations,
                       interface_callbacks::writer::base_writer& message_writer)
        const {
        static const char* function = "stan::variational::advi::adapt_eta";

        stan::math::check_positive(function,
                                   "Number of adaptation iterations",
                                   adapt_iterations);

        message_writer("Begin eta adaptation.");

        // Sequence of eta values to try during adaptation
        const int eta_sequence_size = 5;
        double eta_sequence[eta_sequence_size] = {100, 10, 1, 0.1, 0.01};

        // Initialize ELBO tracking variables
        double elbo = -std::numeric_limits<double>::max();
        double elbo_best = -std::numeric_limits<double>::max();
        double elbo_init;
        try {
          elbo_init = calc_ELBO(variational, message_writer);
        } catch (const std::domain_error& e) {
          const char* name = "Cannot compute ELBO using the initial "
            "variational distribution.";
          const char* msg1 = "Your model may be either "
            "severely ill-conditioned or misspecified.";
          stan::math::domain_error(function, name, "", msg1);
        }

        // Variational family to store gradients
        Q elbo_grad = Q(model_.num_params_r());

        // Adaptive step-size sequence
        Q history_grad_squared = Q(model_.num_params_r());
        double tau = 1.0;
        double pre_factor = 0.9;
        double post_factor = 0.1;

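        // history_grad_squared accumulates an exponentially weighted
        // average of squared gradients (RMSProp-style), and tau keeps
        // the denominator of the update below away from zero.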
        double eta_best = 0.0;
        double eta;
        double eta_scaled;

        bool do_more_tuning = true;
        int eta_sequence_index = 0;
        while (do_more_tuning) {
          // Try next eta
          eta = eta_sequence[eta_sequence_index];

          int print_progress_m;
          for (int iter_tune = 1; iter_tune <= adapt_iterations; ++iter_tune) {
            print_progress_m = eta_sequence_index
              * adapt_iterations + iter_tune;
            stan::services::variational
              ::print_progress(print_progress_m, 0,
                               adapt_iterations * eta_sequence_size,
                               adapt_iterations, true, "", "", message_writer);

            // (ROBUST) Compute gradient of ELBO. It's OK if it diverges.
            // We'll try a smaller eta.
            try {
              calc_ELBO_grad(variational, elbo_grad, message_writer);
            } catch (const std::domain_error& e) {
              elbo_grad.set_to_zero();
            }

            // Update step-size
            if (iter_tune == 1) {
              history_grad_squared += elbo_grad.square();
            } else {
              history_grad_squared = pre_factor * history_grad_squared
                + post_factor * elbo_grad.square();
            }
            eta_scaled = eta / sqrt(static_cast<double>(iter_tune));
            // Stochastic gradient update
            variational += eta_scaled * elbo_grad
              / (tau + history_grad_squared.sqrt());
          }

          // (ROBUST) Compute ELBO. It's OK if it has diverged.
          try {
            elbo = calc_ELBO(variational, message_writer);
          } catch (const std::domain_error& e) {
            elbo = -std::numeric_limits<double>::max();
          }

          // Check if:
          // (1) ELBO at current eta is worse than the best ELBO
          // (2) the best ELBO hasn't gotten worse than the initial ELBO
          if (elbo < elbo_best && elbo_best > elbo_init) {
            std::stringstream ss;
            ss << "Success!"
               << " Found best value [eta = " << eta_best
               << "]";
            if (eta_sequence_index < eta_sequence_size - 1)
              ss << (" earlier than expected.");
            else
              ss << ".";
            message_writer(ss.str());
            message_writer();
            do_more_tuning = false;
          } else {
            if (eta_sequence_index < eta_sequence_size - 1) {
              // Reset
              elbo_best = elbo;
              eta_best = eta;
            } else {
              // No more eta values to try, so use current eta if it
              // didn't diverge or fail if it did diverge
              if (elbo > elbo_init) {
                std::stringstream ss;
                ss << "Success!"
                   << " Found best value [eta = " << eta_best
                   << "].";
                message_writer(ss.str());
                message_writer();
                eta_best = eta;
                do_more_tuning = false;
              } else {
                const char* name = "All proposed step-sizes";
                const char* msg1 = "failed. Your model may be either "
                  "severely ill-conditioned or misspecified.";
                stan::math::domain_error(function, name, "", msg1);
              }
            }
            // Reset
            history_grad_squared.set_to_zero();
          }
          ++eta_sequence_index;
          variational = Q(cont_params_);
        }
        return eta_best;
      }

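      /**
       * Runs stochastic gradient ascent with an adaptive step-size
       * sequence. At iteration k the update is
       *
       *   variational += eta / sqrt(k) * g_k / (tau + sqrt(s_k)),
       *   s_k = 0.9 * s_{k-1} + 0.1 * g_k^2   (elementwise),
       *
       * where g_k is the Monte Carlo estimate of the ELBO gradient.
       */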
      void stochastic_gradient_ascent(Q& variational,
                                      double eta,
                                      double tol_rel_obj,
                                      int max_iterations,
                                      interface_callbacks::writer::base_writer& message_writer,
                                      interface_callbacks::writer::base_writer& diagnostic_writer)
        const {
        static const char* function =
          "stan::variational::advi::stochastic_gradient_ascent";

        stan::math::check_positive(function, "Eta stepsize", eta);
        stan::math::check_positive(function,
                                   "Relative objective function tolerance",
                                   tol_rel_obj);
        stan::math::check_positive(function,
                                   "Maximum iterations",
                                   max_iterations);

        // Gradient parameters
        Q elbo_grad = Q(model_.num_params_r());

        // Stepsize sequence parameters
        Q history_grad_squared = Q(model_.num_params_r());
        double tau = 1.0;
        double pre_factor = 0.9;
        double post_factor = 0.1;
        double eta_scaled;

        // Initialize ELBO and convergence tracking variables
        double elbo(0.0);
        double elbo_best = -std::numeric_limits<double>::max();
        double elbo_prev = -std::numeric_limits<double>::max();
        double delta_elbo = std::numeric_limits<double>::max();
        double delta_elbo_ave = std::numeric_limits<double>::max();
        double delta_elbo_med = std::numeric_limits<double>::max();

        // Heuristic to estimate how far to look back in rolling window
        int cb_size
          = static_cast<int>(std::max(0.1 * max_iterations / eval_elbo_,
                                      2.0));
        boost::circular_buffer<double> elbo_diff(cb_size);

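        // Convergence is declared when the mean or the median of the
        // relative ELBO changes over this rolling window drops below
        // tol_rel_obj.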
        message_writer("Begin stochastic gradient ascent.");
        message_writer("  iter"
                       "       ELBO"
                       "   delta_ELBO_mean"
                       "   delta_ELBO_med"
                       "   notes ");

        // Timing variables
        clock_t start = clock();
        clock_t end;
        double delta_t;

        // Main loop
        bool do_more_iterations = true;
        for (int iter_counter = 1; do_more_iterations; ++iter_counter) {
          // Compute gradient using Monte Carlo integration
          calc_ELBO_grad(variational, elbo_grad, message_writer);

          // Update step-size
          if (iter_counter == 1) {
            history_grad_squared += elbo_grad.square();
          } else {
            history_grad_squared = pre_factor * history_grad_squared
              + post_factor * elbo_grad.square();
          }
          eta_scaled = eta / sqrt(static_cast<double>(iter_counter));

          // Stochastic gradient update
          variational += eta_scaled * elbo_grad
            / (tau + history_grad_squared.sqrt());

          // Check for convergence every "eval_elbo_"th iteration
          if (iter_counter % eval_elbo_ == 0) {
            elbo_prev = elbo;
            elbo = calc_ELBO(variational, message_writer);
            if (elbo > elbo_best)
              elbo_best = elbo;
            delta_elbo = rel_difference(elbo, elbo_prev);
            elbo_diff.push_back(delta_elbo);
            delta_elbo_ave = std::accumulate(elbo_diff.begin(),
                                             elbo_diff.end(), 0.0)
              / static_cast<double>(elbo_diff.size());
            delta_elbo_med = circ_buff_median(elbo_diff);
            std::stringstream ss;
            ss << "  "
               << std::setw(4) << iter_counter
               << "  "
               << std::right << std::setw(9) << std::setprecision(1)
               << elbo
               << "  "
               << std::setw(16) << std::fixed << std::setprecision(3)
               << delta_elbo_ave
               << "  "
               << std::setw(15) << std::fixed << std::setprecision(3)
               << delta_elbo_med;

            end = clock();
            delta_t = static_cast<double>(end - start) / CLOCKS_PER_SEC;

            std::vector<double> print_vector;
            print_vector.clear();
            print_vector.push_back(iter_counter);
            print_vector.push_back(delta_t);
            print_vector.push_back(elbo);
            diagnostic_writer(print_vector);

            if (delta_elbo_ave < tol_rel_obj) {
              ss << "   MEAN ELBO CONVERGED";
              do_more_iterations = false;
            }

            if (delta_elbo_med < tol_rel_obj) {
              ss << "   MEDIAN ELBO CONVERGED";
              do_more_iterations = false;
            }

            if (iter_counter > 10 * eval_elbo_) {
              if (delta_elbo_med > 0.5 || delta_elbo_ave > 0.5) {
                ss << "   MAY BE DIVERGING... INSPECT ELBO";
              }
            }

            message_writer(ss.str());

            if (do_more_iterations == false &&
                rel_difference(elbo, elbo_best) > 0.05) {
              message_writer("Informational Message: The ELBO at a previous "
                             "iteration is larger than the ELBO upon "
                             "convergence!");
              message_writer("This variational approximation may not "
                             "have converged to a good optimum.");
            }
          }

          if (iter_counter == max_iterations) {
            message_writer("Informational Message: The maximum number of "
                           "iterations is reached! The algorithm may not have "
                           "converged.");
            message_writer("This variational approximation is not "
                           "guaranteed to be meaningful.");
            do_more_iterations = false;
          }
        }
      }

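      /**
       * Runs ADVI and writes to output: optionally adapts the eta
       * step size, runs stochastic gradient ascent, then writes the
       * mean of the variational approximation followed by
       * n_posterior_samples_ approximate posterior draws.
       */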
      int run(double eta, bool adapt_engaged, int adapt_iterations,
              double tol_rel_obj, int max_iterations,
              interface_callbacks::writer::base_writer& message_writer,
              interface_callbacks::writer::base_writer& parameter_writer,
              interface_callbacks::writer::base_writer& diagnostic_writer)
        const {
        diagnostic_writer("iter,time_in_seconds,ELBO");

        // Initialize variational approximation
        Q variational = Q(cont_params_);

        if (adapt_engaged) {
          eta = adapt_eta(variational, adapt_iterations, message_writer);
          parameter_writer("Stepsize adaptation complete.");
          std::stringstream ss;
          ss << "eta = " << eta;
          parameter_writer(ss.str());
        }

        stochastic_gradient_ascent(variational, eta,
                                   tol_rel_obj, max_iterations,
                                   message_writer, diagnostic_writer);

        // Write mean of posterior approximation on first output line
        cont_params_ = variational.mean();
        std::vector<double> cont_vector(cont_params_.size());
        for (int i = 0; i < cont_params_.size(); ++i)
          cont_vector.at(i) = cont_params_(i);
        std::vector<int> disc_vector;

        stan::services::io::write_iteration(model_, rng_,
                                            0, cont_vector, disc_vector,
                                            message_writer,
                                            parameter_writer);
        // Draw more samples from posterior and write on subsequent lines
        message_writer();
        std::stringstream ss;
        ss << "Drawing a sample of size "
           << n_posterior_samples_
           << " from the approximate posterior... ";
        message_writer(ss.str());

        for (int n = 0; n < n_posterior_samples_; ++n) {
          variational.sample(rng_, cont_params_);
          for (int i = 0; i < cont_params_.size(); ++i) {
            cont_vector.at(i) = cont_params_(i);
          }
          stan::services::io::write_iteration(model_, rng_,
                                              0, cont_vector, disc_vector,
                                              message_writer,
                                              parameter_writer);
        }
        message_writer("COMPLETED.");

        return stan::services::error_codes::OK;
      }

      // TODO(akucukelbir): move these things to stan math and test there

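      /**
       * Compute the median of a circular buffer. For an even number
       * of elements this returns the upper of the two middle values.
       */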
      double circ_buff_median(const boost::circular_buffer<double>& cb) const {
        // FIXME: naive implementation; creates a copy as a vector
        std::vector<double> v;
        for (boost::circular_buffer<double>::const_iterator i = cb.begin();
             i != cb.end(); ++i) {
          v.push_back(*i);
        }

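        // nth_element partially sorts v so that v[n] holds the value
        // that would sit at index n in fully sorted order.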
        size_t n = v.size() / 2;
        std::nth_element(v.begin(), v.begin() + n, v.end());
        return v[n];
      }

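      /**
       * Compute the relative difference between two double values,
       * |curr - prev| / |prev|; used for the ELBO convergence checks.
       */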
      double rel_difference(double prev, double curr) const {
        return std::fabs((curr - prev) / prev);
      }

    protected:
      Model& model_;
      Eigen::VectorXd& cont_params_;
      BaseRNG& rng_;
      int n_monte_carlo_grad_;
      int n_monte_carlo_elbo_;
      int eval_elbo_;
      int n_posterior_samples_;
    };
  }  // variational
}  // stan

#endif