强化学习--DDPG---tensorflow实现

完整代码：https://github.com/zle1992/Reinforcement_Learning_Game

论文《Continuous control with deep reinforcement learning》https://arxiv.org/pdf/1509.02971.pdf

Deep_Deterministic_Policy_Gradient

DDPG与AC的区别：

AC:

　　Actor: 利用td_error更新参数，td_error 来自Critic

　　Critic:根据value(s)函数的贝尔曼方程更新梯度

DDPG:

　　Actor: maximize the q，输出action

　　Critic：根据Q(s,a)函数的贝尔曼方程更新梯度, 输出q值

DDPG 只能预测连续的动作输出。

逻辑梳理：

1、DDPG是AC 模型，输入包括（S,R,S_,A）

2、Actor

input: (S)

output: a

loss :max(q)

q 来自Critic

3、Critic

input : S 、A

output: q

loss: (R + GAMMA * q_ - q)^2 的均值（即TD误差的均方误差）

问题来了：q_ 如何得到？----> Critic网络输入（S_,a_）即可得到q_。但是不能用正在训练的同一个网络来计算目标值，所以我们使用参数滞后更新（软更新）的目标网络Critic2（不可训练的）

Critic2需要a_，如何得到？----> Actor网络输入（S_）即可得到a_；同理，我们使用Actor2（不可训练的）得到a_

流程

a = actor(s, train)

a_ = actor(s_,not_train)

q = critic(s, a, train)

q_ = critic(s_, a_, not_train)

a_loss = max(q)

c_loss = mean((R + GAMMA * q_ - q)^2)

强化学习--DDPG---tensorflow实现-LMLPHP

代码：

DDPG.py

 import os

 import numpy as np

 import tensorflow as tf

 from abc import ABCMeta, abstractmethod

 # Fix the NumPy and TensorFlow graph-level seeds.
 np.random.seed(1)

 tf.set_random_seed(1)

 import logging  # bring in the logging module

 logging.basicConfig(level=logging.DEBUG,

                     format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')  # configure the output format and destination of log records

 # Because the basic logging configuration sets the level to DEBUG, all log messages below are shown on the console.

 # Session configuration that lets TensorFlow grow GPU memory on demand.
 tfconfig = tf.ConfigProto()

 tfconfig.gpu_options.allow_growth = True

 # NOTE(review): this module-level session is not referenced by the DDPG class in this file,
 # which opens its own `tf.Session()` — confirm whether other modules rely on it.
 session = tf.Session(config=tfconfig)

 class DDPG(object):
     """Base class for a Deep Deterministic Policy Gradient (DDPG) agent.

     The constructor builds the whole TF1 graph: an online ("eval") and a
     target actor, an online and a target critic, the two training ops, and
     the soft-update op that moves the target networks towards the online
     ones. Subclasses provide the architectures by implementing
     `_build_a_net` and `_build_c_net`.
     """

     # NOTE: `__metaclass__` is only honoured by Python 2. Under Python 3 the
     # abstract hooks are enforced by their NotImplementedError bodies.
     __metaclass__ = ABCMeta

     def __init__(self,
                  n_actions,
                  n_features,
                  reward_decay,
                  lr_a,
                  lr_c,
                  memory_size,
                  output_graph,
                  log_dir,
                  model_dir,
                  TAU,
                  a_bound,
                  ):
         """Build the graph, open a session and restore a checkpoint if present.

         Args:
             n_actions: stored on the instance for the subclass network builders.
             n_features: list holding the state shape without the batch
                 dimension (it is concatenated to `[None]`).
             reward_decay: discount factor gamma used in the TD target.
             lr_a: learning rate of the actor's Adam optimizer.
             lr_c: learning rate of the critic's Adam optimizer.
             memory_size: stored on the instance; not used by this class.
             output_graph: if truthy, the graph is written to `log_dir`.
             log_dir: directory for the TensorBoard graph dump.
             model_dir: directory where checkpoints are saved and restored.
             TAU: soft-replacement rate for the target networks.
             a_bound: stored on the instance for the subclass network builders.
         """
         super(DDPG, self).__init__()

         self.n_actions = n_actions
         self.n_features = n_features
         self.gamma = reward_decay
         self.memory_size = memory_size
         self.output_graph = output_graph
         self.lr_a = lr_a
         self.lr_c = lr_c
         self.log_dir = log_dir
         self.model_dir = model_dir
         # total learning step
         self.learn_step_counter = 0
         self.TAU = TAU     # soft replacement
         self.a_bound = a_bound

         self.s = tf.placeholder(tf.float32, [None] + self.n_features, name='s')
         self.s_next = tf.placeholder(tf.float32, [None] + self.n_features, name='s_next')
         self.r = tf.placeholder(tf.float32, [None, ], name='r')

         with tf.variable_scope('Actor'):
             self.a = self._build_a_net(self.s, scope='eval', trainable=True)
             a_ = self._build_a_net(self.s_next, scope='target', trainable=False)

         with tf.variable_scope('Critic'):
             q = self._build_c_net(self.s, self.a, scope='eval', trainable=True)
             q_ = self._build_c_net(self.s_next, a_, scope='target', trainable=False)

         # networks parameters
         self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
         self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
         self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
         self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

         with tf.variable_scope('train_op_actor'):
             # Maximising q is expressed as minimising its negated mean.
             self.loss_actor = -tf.reduce_mean(q)
             self.train_op_actor = tf.train.AdamOptimizer(self.lr_a).minimize(
                 self.loss_actor, var_list=self.ae_params)

         with tf.variable_scope('train_op_critic'):
             # `self.r` has shape (batch,). Flatten both critic outputs to
             # (batch,) before combining them: adding a (batch,) reward to a
             # (batch, 1) q-value would broadcast to (batch, batch) and make
             # the loss compare every sample with every other sample.
             q_flat = tf.reshape(q, [-1])
             q_next_flat = tf.reshape(q_, [-1])
             q_target = self.r + self.gamma * q_next_flat
             self.loss_critic = tf.losses.mean_squared_error(labels=q_target, predictions=q_flat)
             self.train_op_critic = tf.train.AdamOptimizer(self.lr_c).minimize(
                 self.loss_critic, var_list=self.ce_params)

         # target net replacement: t <- (1 - TAU) * t + TAU * e
         self.soft_replace = [tf.assign(t, (1 - self.TAU) * t + self.TAU * e)
                              for t, e in zip(self.at_params + self.ct_params,
                                              self.ae_params + self.ce_params)]

         self.sess = tf.Session()

         if self.output_graph:
             tf.summary.FileWriter(self.log_dir, self.sess.graph)

         self.sess.run(tf.global_variables_initializer())

         self.cost_his = [0]
         self.cost = 0

         self.saver = tf.train.Saver()

         # makedirs (not mkdir) so that nested paths such as 'a/b/c/' work
         # even when the parent directories do not exist yet.
         if not os.path.isdir(self.model_dir):
             os.makedirs(self.model_dir)

         checkpoint = tf.train.get_checkpoint_state(self.model_dir)
         if checkpoint and checkpoint.model_checkpoint_path:
             self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
             print ("Loading Successfully")
             # The global step is the suffix after the last '-' in the path.
             self.learn_step_counter = int(checkpoint.model_checkpoint_path.split('-')[-1]) + 1

     @abstractmethod
     def _build_a_net(self, x, scope, trainable):
         """Build an actor network and return its action tensor.

         Args:
             x: state tensor fed to the network.
             scope: variable-scope name ('eval' or 'target' in this class).
             trainable: whether the created variables are trainable.
         """
         raise NotImplementedError

     @abstractmethod
     def _build_c_net(self, x, a, scope, trainable):
         """Build a critic network and return its q-value tensor.

         Args:
             x: state tensor fed to the network.
             a: action tensor fed to the network.
             scope: variable-scope name ('eval' or 'target' in this class).
             trainable: whether the created variables are trainable.
         """
         raise NotImplementedError

     def learn(self, data):
         """Run one soft target update, one actor step and one critic step.

         Args:
             data: mapping with the keys 's', 'a', 'r' and 's_' holding a
                 batch of states, actions, rewards and next states.
         """
         # soft target replacement
         self.sess.run(self.soft_replace)

         batch_memory_s = data['s']
         batch_memory_a = data['a']
         batch_memory_r = data['r']
         batch_memory_s_ = data['s_']

         # Actor step: only the states are needed, the action comes from the actor itself.
         self.sess.run(self.train_op_actor, feed_dict={self.s: batch_memory_s})

         # Critic step: feeding `self.a` overrides the actor output with the stored actions.
         _, cost = self.sess.run(
             [self.train_op_critic, self.loss_critic],
             feed_dict={
                 self.s: batch_memory_s,
                 self.a: batch_memory_a,
                 self.r: batch_memory_r,
                 self.s_next: batch_memory_s_,
             })

         self.cost_his.append(cost)
         self.cost = cost
         self.learn_step_counter += 1

         # save network every 10000 iterations
         if self.learn_step_counter % 10000 == 0:
             # Use a file prefix inside model_dir so the checkpoint lands in
             # that directory whether or not model_dir ends with a separator.
             self.saver.save(self.sess, os.path.join(self.model_dir, 'ddpg'),
                             global_step=self.learn_step_counter)

     def choose_action(self, s):
         """Return the online actor's action for the single state `s`.

         `s` is indexed with `np.newaxis` to add a batch dimension, and the
         first (only) row of the actor output is returned.
         """
         return self.sess.run(self.a, {self.s: s[np.newaxis, :]})[0]

game.py

 import sys

 import gym

 import numpy as np

 import tensorflow as tf

 sys.path.append('./')

 sys.path.append('model')

 from util import Memory ,StateProcessor

 from DDPG import DDPG

 from ACNetwork import ACNetwork

 # Fix the NumPy and TensorFlow graph-level seeds.
 np.random.seed(1)

 tf.set_random_seed(1)

 import logging  # bring in the logging module

 logging.basicConfig(level=logging.DEBUG,

                     format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')  # configure the output format and destination of log records

 # Because the basic logging configuration sets the level to DEBUG, all log messages below are shown on the console.

 import os

 # An empty device list hides every CUDA device from this process.
 os.environ["CUDA_VISIBLE_DEVICES"] = ""

 # Session configuration that lets TensorFlow grow GPU memory on demand.
 tfconfig = tf.ConfigProto()

 tfconfig.gpu_options.allow_growth = True

 # NOTE(review): this session is not referenced again in the visible code — confirm it is needed.
 session = tf.Session(config=tfconfig)

 class DDPG4Pendulum(DDPG):
     """DDPG agent with small fully-connected actor and critic networks.

     Both networks use a single 30-unit ReLU hidden layer. The actor ends in
     a tanh layer scaled by `self.a_bound`; the critic ends in a linear
     single-unit layer.
     """

     def __init__(self, **kwargs):
         """Forward all keyword arguments unchanged to `DDPG.__init__`."""
         super(DDPG4Pendulum, self).__init__(**kwargs)

     def _build_a_net(self, s, scope, trainable):
         """Build the actor under `scope` and return the scaled action tensor.

         Args:
             s: state tensor used as network input.
             scope: variable-scope name for the created layers.
             trainable: whether the layer variables are trainable.
         """
         kernel_init = tf.random_normal_initializer(0., 0.3)
         bias_init = tf.constant_initializer(0.1)

         with tf.variable_scope(scope):
             hidden = tf.layers.dense(
                 inputs=s,
                 units=30,
                 activation=tf.nn.relu,
                 kernel_initializer=kernel_init,
                 bias_initializer=bias_init,
                 trainable=trainable)
             squashed = tf.layers.dense(
                 inputs=hidden,
                 units=self.n_actions,
                 activation=tf.nn.tanh,
                 kernel_initializer=kernel_init,
                 bias_initializer=bias_init,
                 trainable=trainable)

         # tanh output lies in [-1, 1]; multiply by a_bound to rescale it.
         return tf.multiply(squashed, self.a_bound, name='scaled_a')

     def _build_c_net(self, s, a, scope, trainable):
         """Build the critic under `scope` and return its q-value tensor.

         The first layer is written out by hand so that state and action
         each get their own weight matrix; no initializer is passed for
         those variables, so `tf.get_variable`'s default applies.

         Args:
             s: state tensor.
             a: action tensor.
             scope: variable-scope name for the created variables.
             trainable: whether the created variables are trainable.
         """
         kernel_init = tf.random_normal_initializer(0., 0.3)
         bias_init = tf.constant_initializer(0.1)
         hidden_units = 30

         with tf.variable_scope(scope):
             state_weights = tf.get_variable(
                 'w1_s', self.n_features + [hidden_units], trainable=trainable)
             action_weights = tf.get_variable(
                 'w1_a', [self.n_actions, hidden_units], trainable=trainable)
             hidden_bias = tf.get_variable('b1', [1, hidden_units], trainable=trainable)

             pre_activation = tf.matmul(s, state_weights) + tf.matmul(a, action_weights) + hidden_bias
             hidden = tf.nn.relu(pre_activation)

             return tf.layers.dense(
                 inputs=hidden,
                 units=1,
                 activation=None,
                 kernel_initializer=kernel_init,
                 bias_initializer=bias_init,
                 trainable=trainable)

 # Number of transitions sampled from the replay memory per learning step.
 batch_size = 32

 # Capacity passed to both the replay memory and the agent.
 memory_size  =10000

 env = gym.make('Pendulum-v0') # continuous action space

 # State shape as a list, the form the DDPG placeholders expect.
 n_features= [env.observation_space.shape[0]]

 n_actions= env.action_space.shape[0]

 # Upper bound of the action space, used to scale the actor output.
 a_bound = env.action_space.high

 # NOTE(review): presumably unwrapped to bypass gym's wrappers (e.g. an episode time limit) — confirm.
 env = env.unwrapped

 # Number of environment steps per episode in the training loop.
 MAX_EP_STEPS =200

 def run():
     """Train a DDPG4Pendulum agent on the module-level `env`.

     Runs 2000 episodes of MAX_EP_STEPS steps each. Transitions are stored
     in a replay memory; once more than `memory_size` steps have been taken
     the agent learns from a sampled batch on every step and the exploration
     noise is decayed. The environment is closed at the end.
     """
     RL = DDPG4Pendulum(
         n_actions=n_actions,
         n_features=n_features,
         reward_decay=0.9,
         lr_a=0.001,
         lr_c=0.002,
         memory_size=memory_size,
         TAU=0.01,
         output_graph=False,
         log_dir='Pendulum/log/DDPG4Pendulum/',
         a_bound=a_bound,
         model_dir='Pendulum/model_dir/DDPG4Pendulum/'
         )

     memory = Memory(n_actions, n_features, memory_size=memory_size)

     var = 3  # standard deviation of the exploration noise
     step = 0

     for episode in range(2000):
         # initial observation
         observation = env.reset()
         ep_r = 0

         for j in range(MAX_EP_STEPS):
             # Deterministic action from the actor ...
             action = RL.choose_action(observation)
             # ... plus Gaussian noise for exploration, clipped to the
             # environment's action bounds instead of a hard-coded range.
             action = np.clip(np.random.normal(action, var), -a_bound, a_bound)

             # Apply the action and observe the next state and reward.
             # The termination flag is ignored: every episode runs for
             # exactly MAX_EP_STEPS steps.
             observation_, reward, _, _ = env.step(action)

             # The reward is divided by 10 before it is stored.
             memory.store_transition(observation, action, reward / 10, observation_)

             if step > memory_size:
                 var *= .9995    # decay the action randomness
                 RL.learn(memory.sample(batch_size))

             # swap observation
             observation = observation_
             ep_r += reward

             if episode > 200:
                 env.render()  # render on the screen

             # Count every environment step, including the last one of the
             # episode (previously skipped by an early `break`).
             step += 1

         print('step: ', step,
               'episode: ', episode,
               'ep_r: ', round(ep_r, 2),
               'var:', var,
               )

     # end of game
     print('game over')
     # gym environments are released with close(); they have no destroy().
     env.close()

 def main():
     """Script entry point: start the Pendulum training loop."""
     run()


 if __name__ == '__main__':
     main()