This time we are going to build a full policy gradient implementation and train it on the OpenAI Gym CartPole environment. Unlike the previous simple policy gradient implementation, the agent now has to take the observed state of the environment into account when deciding which action to take, and the training network becomes slightly more complex.
import numpy as np
import random
import gym
import tensorflow as tf
import tensorflow.contrib.slim as slim

from nv.core.utils import *

# cf. https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724

def train_policy_network():
    logDEBUG("Building environment...")

    env = gym.make('CartPole-v0')

    # Hyperparameters
    H = 10              # number of hidden layer neurons
    batch_size = 5      # every how many episodes to do a param update?
    learning_rate = 1e-2  # feel free to play with this to train faster or more stably.
    gamma = 0.99        # discount factor for reward

    D = 4               # input dimensionality

    def discount_rewards(r):
        """ take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, r.size)):
            running_add = running_add * gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

    tf.reset_default_graph()

    # This defines the network as it goes from taking an observation of the environment to
    # giving a probability of choosing the action of moving left or right.
    observations = tf.placeholder(tf.float32, [None, D], name="input_x")
    W1 = tf.get_variable("W1", shape=[D, H],
                         initializer=tf.contrib.layers.xavier_initializer())
    layer1 = tf.nn.relu(tf.matmul(observations, W1))
    W2 = tf.get_variable("W2", shape=[H, 1],
                         initializer=tf.contrib.layers.xavier_initializer())
    score = tf.matmul(layer1, W2)
    probability = tf.nn.sigmoid(score)

    # From here we define the parts of the network needed for learning a good policy.
    tvars = tf.trainable_variables()
    input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
    advantages = tf.placeholder(tf.float32, name="reward_signal")

    # The loss function. This sends the weights in the direction of making actions
    # that gave good advantage (reward over time) more likely, and actions that didn't less likely.
    loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
    loss = -tf.reduce_mean(loglik * advantages)
    newGrads = tf.gradients(loss, tvars)

    # Once we have collected a series of gradients from multiple episodes, we apply them.
    # We don't just apply gradients after every episode in order to account for noise in the reward signal.
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)  # Our optimizer
    W1Grad = tf.placeholder(tf.float32, name="batch_grad1")  # Placeholders to send the final gradients through when we update.
    W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
    batchGrad = [W1Grad, W2Grad]
    updateGrads = adam.apply_gradients(zip(batchGrad, tvars))

    # Running the training:
    xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 1
    total_episodes = 10000
    init = tf.global_variables_initializer()

    # Launch the graph
    with tf.Session() as sess:
        rendering = False
        sess.run(init)
        observation = env.reset()  # Obtain an initial observation of the environment

        # Reset the gradient placeholder. We will collect gradients in
        # gradBuffer until we are ready to update our policy network.
        gradBuffer = sess.run(tvars)
        for ix, grad in enumerate(gradBuffer):
            gradBuffer[ix] = grad * 0

        while episode_number <= total_episodes:

            # Rendering the environment slows things down,
            # so let's only look at it once our agent is doing a good job.
            # Note: I currently do not have support for OpenGL in custom python build.
            # So the rendering below is not working.
            # if reward_sum/batch_size > 100 or rendering == True:
            #     env.render()
            #     rendering = True

            # Make sure the observation is in a shape the network can handle.
            x = np.reshape(observation, [1, D])

            # Run the policy network and get an action to take.
            tfprob = sess.run(probability, feed_dict={observations: x})
            action = 1 if np.random.uniform() < tfprob else 0

            xs.append(x)  # observation
            y = 1 if action == 0 else 0  # a "fake label"
            ys.append(y)

            # step the environment and get new measurements
            observation, reward, done, info = env.step(action)
            reward_sum += reward

            drs.append(reward)  # record reward (has to be done after we call step() to get reward for previous action)

            if done:
                episode_number += 1
                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                epx = np.vstack(xs)
                epy = np.vstack(ys)
                epr = np.vstack(drs)
                tfp = tfps
                xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []  # reset array memory

                # compute the discounted reward backwards through time
                discounted_epr = discount_rewards(epr)
                # size the rewards to be unit normal (helps control the gradient estimator variance)
                discounted_epr -= np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)

                # Get the gradient for this episode, and save it in the gradBuffer
                tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
                for ix, grad in enumerate(tGrad):
                    gradBuffer[ix] += grad

                # If we have completed enough episodes, then update the policy network with our gradients.
                if episode_number % batch_size == 0:
                    sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0

                    # Give a summary of how well our network is doing for each batch of episodes.
                    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01

                    logDEBUG('%d/%d: Average reward for last %d episodes: %f. Total average reward %f.' % (episode_number, total_episodes, batch_size, reward_sum//batch_size, running_reward//batch_size))

                    if reward_sum//batch_size >= 200:
                        logDEBUG("Task solved in %d episodes!" % episode_number)
                        break

                    reward_sum = 0

                observation = env.reset()

        logDEBUG("%d episodes completed." % episode_number)
2019-03-08T14:02:38.992845 [DEBUG] 1055/10000: Average reward for last 5 episodes: 150.000000. Total average reward 78.000000.
2019-03-08T14:02:39.266814 [DEBUG] 1060/10000: Average reward for last 5 episodes: 138.000000. Total average reward 79.000000.
2019-03-08T14:02:39.511682 [DEBUG] 1065/10000: Average reward for last 5 episodes: 124.000000. Total average reward 79.000000.
2019-03-08T14:02:39.720856 [DEBUG] 1070/10000: Average reward for last 5 episodes: 105.000000. Total average reward 79.000000.
2019-03-08T14:02:40.067299 [DEBUG] 1075/10000: Average reward for last 5 episodes: 168.000000. Total average reward 80.000000.
2019-03-08T14:02:40.307538 [DEBUG] 1080/10000: Average reward for last 5 episodes: 94.000000. Total average reward 80.000000.
2019-03-08T14:02:40.711284 [DEBUG] 1085/10000: Average reward for last 5 episodes: 200.000000. Total average reward 82.000000.
2019-03-08T14:02:40.712113 [DEBUG] Task solved in 1085 episodes!
2019-03-08T14:02:40.714217 [DEBUG] 1085 episodes completed.
2019-03-08T14:02:40.715371 [DEBUG] Training completed in 33.425390 seconds.
\[Loss = - \sum_i \log\big(y_i (y_i - p_i) + (1 - y_i)(y_i + p_i)\big) \, A_i\]
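Plugging in the two possible values of the "fake label" shows what the log term actually measures for each sample (here \(p_i\) is the sigmoid output of the network):

\[y_i = 1 \;\Rightarrow\; \log(1 - p_i), \qquad y_i = 0 \;\Rightarrow\; \log(p_i)\]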
y = 1 if action == 0 else 0 # a "fake label"
# Run the policy network and get an action to take.
tfprob = sess.run(probability, feed_dict={observations: x})
action = 1 if np.random.uniform() < tfprob else 0

xs.append(x)  # observation
y = 1 if action == 0 else 0  # a "fake label"
ys.append(y)
y = 1 if action == 1 else 0 # a "fake label"
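Since the log term evaluates to \(\log(p)\) for \(y = 0\) and \(\log(1 - p)\) for \(y = 1\), while \(p\) is the probability of choosing action 1, the inverted label \(y = 1 - action\) is what turns loglik into the log-probability of the action that was actually taken; labelling with \(y = action\) would instead credit the opposite action. The small standalone numpy check below (hypothetical values, not part of the training code) illustrates the difference:

import numpy as np

def loglik(y, p):
    # Same expression as in the TensorFlow graph, written with numpy:
    return np.log(y * (y - p) + (1 - y) * (y + p))

p = 0.7      # network output: probability of choosing action 1
action = 1   # suppose the sampled action was 1

y_inverted = 1 - action  # convention used in the training loop: y = 1 if action == 0 else 0
y_direct = action        # the "intuitive" labelling: y = 1 if action == 1 else 0

print(loglik(y_inverted, p))  # log(0.7): log-probability of the action actually taken
print(loglik(y_direct, p))    # log(0.3): log-probability of the action that was NOT taken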
\[A_t = \sum_{k=0}^\infty \gamma^k r_{t+k}\]
# compute the discounted reward backwards through time
discounted_epr = discount_rewards(epr)
# size the rewards to be unit normal (helps control the gradient estimator variance)
discounted_epr -= np.mean(discounted_epr)
discounted_epr /= np.std(discounted_epr)
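To make the effect of discounting and normalization concrete, here is a small standalone numpy example (the 5-step episode is made up for illustration, it is not taken from an actual run) using the same discount_rewards logic and gamma = 0.99 as in the training code:

import numpy as np

gamma = 0.99  # discount factor, same value as in the training code

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

# CartPole gives a reward of 1.0 for every step the pole stays up;
# here we pretend the episode lasted 5 steps:
epr = np.array([1.0, 1.0, 1.0, 1.0, 1.0])

discounted_epr = discount_rewards(epr)
print(discounted_epr)
# [4.90099501 3.940399   2.9701     1.99       1.        ]

discounted_epr -= np.mean(discounted_epr)
discounted_epr /= np.std(discounted_epr)
print(discounted_epr)
# approximately [ 1.41  0.71  0.01 -0.70 -1.42 ]: earlier actions get a larger advantage

Each action is thus credited with the (normalized) sum of rewards that followed it, which is exactly the advantage signal fed into the loss above.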