python - 具有Tensorflow概率的贝叶斯线性回归

我无法使贝叶斯线性回归与Tensorflow概率一起使用。这是我的代码：

!pip install tensorflow==2.0.0-rc1
!pip install tensorflow-probability==0.8.0rc0

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
N = 20
std = 1
m = np.random.normal(0, scale=5, size=1).astype(np.float32)
b = np.random.normal(0, scale=5, size=1).astype(np.float32)
x = np.linspace(0, 100, N).astype(np.float32)
y = m*x+b+ np.random.normal(loc=0, scale=std, size=N).astype(np.float32)

num_results = 10000
num_burnin_steps = 5000

def joint_log_prob(x, y, m, b, std):
  rv_m = tfd.Normal(loc=0, scale=5)
  rv_b = tfd.Normal(loc=0, scale=5)
  rv_std = tfd.HalfCauchy(loc=0., scale=2.)

  y_mu = m*x+b
  rv_y = tfd.Normal(loc=y_mu, scale=std)

  return (rv_m.log_prob(m) + rv_b.log_prob(b) + rv_std.log_prob(std)
          + tf.reduce_sum(rv_y.log_prob(y)))

# Define a closure over our joint_log_prob.
def target_log_prob_fn(m, b, std):
    return joint_log_prob(x, y, m, b, std)

@tf.function(autograph=False)
def do_sampling():
  kernel=tfp.mcmc.HamiltonianMonteCarlo(
          target_log_prob_fn=target_log_prob_fn,
          step_size=0.05,
          num_leapfrog_steps=3)
  kernel = tfp.mcmc.SimpleStepSizeAdaptation(
        inner_kernel=kernel, num_adaptation_steps=int(num_burnin_steps * 0.8))
  return tfp.mcmc.sample_chain(
      num_results=num_results,
      num_burnin_steps=num_burnin_steps,
      current_state=[
          0.01 * tf.ones([], name='init_m', dtype=tf.float32),
          0.01 * tf.ones([], name='init_b', dtype=tf.float32),
          1 * tf.ones([], name='init_std', dtype=tf.float32)
      ],
      kernel=kernel,
      trace_fn=lambda _, pkr: [pkr.inner_results.accepted_results.step_size,
                               pkr.inner_results.log_accept_ratio])

samples, [step_size, log_accept_ratio] = do_sampling()
m_posterior, b_posterior, std_posterior = samples

p_accept = tf.reduce_mean(tf.exp(tf.minimum(log_accept_ratio, 0.)))
print('Acceptance rate: {}'.format(p_accept))

n_v = len(samples)
true_values = [m, b, std]

plt.figure()
plt.title('Training data')
plt.plot(x, y)

plt.figure()
plt.title('Visualizing trace and posterior distributions')
for i, (sample, true_value) in enumerate(zip(samples, true_values)):
  plt.subplot(2*n_v, 2, 2*i+1)
  plt.plot(sample)
  plt.subplot(2*n_v, 2, 2*i+2)
  plt.hist(sample)
  plt.axvline(x=true_value)
>>> Acceptance rate: 0.006775229703634977

有任何想法吗？

最佳答案

如此令人惊讶的问题真是令人惊讶！原始HMC在设置推理时可能对相对较小的细节极为敏感。像Stan这样的系统会尝试通过为您做很多调整来解决这个问题，但是到目前为止，TFP的自动调整已经变得更加基础。

我发现一些更改似乎在这里使推理工作正常。简而言之，它们是：

在不受限制的空间中重新参数化。
使用不均匀的步长（等效：添加对角线预处理器）
增加HMC跳过步骤的数量。
使用DualAveragingStepSizeAdaptation代替SimpleStepSizeAdaptation。

第一个技巧是使用TransformedTransitionKernel重新参数化，以便scale参数位于不受约束的空间中。例如，我们可以使用Exp射手定义对数刻度参数：

tfb = tfp.bijectors
kernel = tfp.mcmc.TransformedTransitionKernel(
      inner_kernel=kernel,
      bijector=[tfb.Identity(), tfb.Identity(), tfb.Exp()]
  )

这样可以确保推理仅考虑比例尺的正值，因此不必拒绝使比例尺低于零的所有动作。当我这样做时，尽管混合效果仍然不佳，但接受率却上升了。

第二个变化是对三个变量使用不均匀的步长（这等效于对角线预处理）。看起来此模型中的后部条件不好：与确定截距或比例相比，二十个数据点确定坡度要精确得多。 TFP中的“步长调整”只是找到一个步长，以使指定百分比的样本可以被接受，这通常由后验中受最严格约束的分量控制：如果其他分量具有更宽的后验，则小步长将阻止他们从混合。估算合理步长的一种方法是使用来自变量推断的标准偏差和正态替代后验因子：

surrogate_posterior = tfp.experimental.vi.build_factored_surrogate_posterior(
    event_shape=[[], [], []],
    constraining_bijectors=[None, None, tfb.Exp()])
losses = tfp.vi.fit_surrogate_posterior(
    target_log_prob_fn, surrogate_posterior,
    optimizer=tf.optimizers.Adam(
        learning_rate=0.1,
        # Decay second-moment estimates to aid optimizing scale parameters.
        beta_2=0.9),
    num_steps=1000)
approximate_posterior_stddevs = [np.std(x) for x in surrogate_posterior.sample(50)]

另一个通用技巧是增加越级步数。考虑HMC的一种方法是，在跨越式集成器中，它类似于具有动量的优化器，但是每次停止接受/拒绝时，它都会失去动量（通过重新采样）。因此，在极端情况下，我们每一个步骤（num_leapfrog_steps=1，即Langevin动力学）都没有动量，并且增加跳越步骤的数量往往会提高导航棘手几何的能力，这与动量如何改善优化器相似。我没有严格调整任何内容，但是在这里设置num_leapfrog_steps=16而不是3似乎很有帮助。

这是我修改后的代码版本，结合了这些技巧。在大多数执行中，它似乎混合得很好（尽管我确定它并不完美）：

!pip install tensorflow==2.0.0-rc1
!pip install tensorflow-probability==0.8.0rc0

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
tfb = tfp.bijectors

N = 20
std = 1
m = np.random.normal(0, scale=5, size=1).astype(np.float32)
b = np.random.normal(0, scale=5, size=1).astype(np.float32)
x = np.linspace(0, 100, N).astype(np.float32)
y = m*x+b+ np.random.normal(loc=0, scale=std, size=N).astype(np.float32)

num_results = 1000
num_burnin_steps = 500

def joint_log_prob(x, y, m, b, std):
  rv_m = tfd.Normal(loc=0, scale=5)
  rv_b = tfd.Normal(loc=0, scale=5)
  rv_std = tfd.HalfCauchy(0., scale=1.)

  y_mu = m*x+b
  rv_y = tfd.Normal(loc=y_mu, scale=rv_std[..., None])

  return (rv_m.log_prob(m) + rv_b.log_prob(b)
          + rv_std.log_prob(std)
          + tf.reduce_sum(rv_y.log_prob(y)))

# Define a closure over our joint_log_prob.
def target_log_prob_fn(m, b, std):
    return joint_log_prob(x, y, m, b, std)

# Run variational inference to initialize per-variable step sizes.
surrogate_posterior = tfp.experimental.vi.build_factored_surrogate_posterior(
    event_shape=[[], [], []],
    constraining_bijectors=[None, None, tfb.Exp()])
losses = tfp.vi.fit_surrogate_posterior(
    target_log_prob_fn,
    surrogate_posterior,
    optimizer=tf.optimizers.Adam(
        learning_rate=0.1,
        # Decay second-moment estimates to aid optimizing scale parameters.
        beta_2=0.9),
    num_steps=1000)
approximate_posterior_stddevs = [np.std(z) for z in surrogate_posterior.sample(50)]

@tf.function(autograph=False)
def do_sampling():
  kernel=tfp.mcmc.HamiltonianMonteCarlo(
          target_log_prob_fn=target_log_prob_fn,
          step_size=approximate_posterior_stddevs,
          num_leapfrog_steps=16)
  kernel = tfp.mcmc.TransformedTransitionKernel(
      inner_kernel=kernel,
      bijector=[tfb.Identity(),
                tfb.Identity(),
                tfb.Exp()]
  )
  kernel = tfp.mcmc.DualAveragingStepSizeAdaptation(
        inner_kernel=kernel,
        num_adaptation_steps=int(num_burnin_steps * 0.8))
  return tfp.mcmc.sample_chain(
      num_results=num_results,
      num_burnin_steps=num_burnin_steps,
      current_state=[
          0.01 * tf.ones([], name='init_m', dtype=tf.float32),
          0.01 * tf.ones([], name='init_b', dtype=tf.float32),
          1. * tf.ones([], name='init_std', dtype=tf.float32)
      ],
      kernel=kernel,
      trace_fn=lambda _, pkr: [pkr.inner_results.inner_results.accepted_results.step_size,
                               pkr.inner_results.inner_results.log_accept_ratio])

samples, [step_size, log_accept_ratio] = do_sampling()
m_posterior, b_posterior, std_posterior = samples

p_accept = tf.reduce_mean(tf.exp(tf.minimum(log_accept_ratio, 0.)))
print('Acceptance rate: {}'.format(p_accept))

n_v = len(samples)
true_values = [m, b, std]

plt.figure(figsize=(12, 12))
plt.title('Visualizing trace and posterior distributions')
for i, (sample, true_value) in enumerate(zip(samples, true_values)):
  plt.subplot(2*n_v, 2, 2*i+1)
  plt.plot(sample)
  plt.subplot(2*n_v, 2, 2*i+2)
  plt.hist(sample.numpy())
  plt.axvline(x=true_value)