| """ Note: This is a updated version from my previous code, for the target network, I use moving average to soft replace target parameters instead using assign function. By doing this, it has 20% speed up on my machine (CPU).
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning. DDPG is Actor Critic based algorithm. Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using: tensorflow 1.0 gym 0.8.0 """
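# For reference, the explicit assign-based soft replacement mentioned in the note
# above is usually written roughly like this (a sketch only; `t_params` and
# `e_params` are hypothetical names for the target and evaluation variable lists):
#
#     soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
#                     for t, e in zip(t_params, e_params)]
#     sess.run(soft_replace)  # executed once per learning step
#
# The DDPG class below avoids these extra ops by letting
# tf.train.ExponentialMovingAverage(decay=1 - TAU) maintain the target parameters
# and reading them back through a custom variable getter.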
import tensorflow as tf
import numpy as np
import gym
import time
MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001            # learning rate for the actor
LR_C = 0.002            # learning rate for the critic
GAMMA = 0.9             # reward discount
TAU = 0.01              # soft replacement rate for the target networks
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32
RENDER = False
ENV_NAME = 'Pendulum-v0'
class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        self.a = self._build_a(self.S)
        q = self._build_c(self.S, self.a)
        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)  # soft replacement
        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))
        target_update = [ema.apply(a_params), ema.apply(c_params)]
        a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)
        q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)
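        # The target actor/critic above read the shadow variables maintained by
        # ema.apply(...): theta' <- (1 - TAU) * theta' + TAU * theta. Via ema_getter,
        # a_ and q_ are therefore computed with the softly replaced target parameters.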
        a_loss = -tf.reduce_mean(q)  # maximize q for the actor
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
        with tf.control_dependencies(target_update):
            q_target = self.R + GAMMA * q_
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)
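        # The critic regresses Q(s, a) toward q_target = r + GAMMA * Q'(s_, mu'(s_)),
        # where the primed networks use the moving-average parameters. Because the
        # update is built under tf.control_dependencies(target_update), the EMA
        # updates run every time self.ctrain is executed.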
        self.sess.run(tf.global_variables_initializer())
    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]
    def learn(self):
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]
        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})
    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # overwrite the oldest memory once the buffer is full
        self.memory[index, :] = transition
        self.pointer += 1
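    # Each memory row is laid out as [s | a | r | s_] with widths s_dim, a_dim, 1
    # and s_dim, which is exactly the slicing that learn() undoes when sampling.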
    def _build_a(self, s, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')
    def _build_c(self, s, a, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s, a)
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high
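# For the Pendulum-v0 task the observation is [cos(theta), sin(theta), theta_dot],
# so s_dim = 3, and the action is a single torque bounded at +/-2,
# so a_dim = 1 and a_bound = [2.].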
ddpg = DDPG(a_dim, s_dim, a_bound)
var = 3  # standard deviation of the exploration noise
t1 = time.time()
for i in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)  # add exploration noise, clipped to the action range
        s_, r, done, info = env.step(a)
        ddpg.store_transition(s, a, r / 10, s_)  # store the transition with the reward scaled down by 10
        if ddpg.pointer > MEMORY_CAPACITY:
            var *= .9995  # decay the exploration noise once the replay memory is full
            ddpg.learn()
        s = s_
        ep_reward += r
        if j == MAX_EP_STEPS - 1:
            print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var)
            break
print('Running time: ', time.time() - t1)