====== Multi-actions Vanilla Policy Gradient ======

{{tag>

As a small extension to the previous [[.0308_full_policy_gradient|policy gradient implementation]] we discussed, we are now going to study how to support **multiple actions** (i.e. num_actions > 2) in the policy network.
+ | |||
+ | ====== ====== | ||
+ | |||
+ | ===== References ===== | ||
+ | |||
+ | * [[https:// | ||
+ | * https:// | ||
+ | |||
+ | ===== Reference implementation ===== | ||
+ | |||
  * The reference implementation is somewhat different from our previous model, since a dedicated **agent** class is introduced here: <sxh python>
class agent():
    def __init__(self, lr, s_size, a_size, h_size):
        # These lines establish the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        hidden = slim.fully_connected(self.state_in, h_size, biases_initializer=None, activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden, a_size, activation_fn=tf.nn.softmax, biases_initializer=None)
        self.chosen_action = tf.argmax(self.output, 1)

        # The next six lines establish the training procedure. We feed the reward and chosen action into the network
        # to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)

        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx, var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32, name=str(idx) + '_holder')
            self.gradient_holders.append(placeholder)

        self.gradients = tf.gradients(self.loss, tvars)

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))
</sxh>

  * => But we are not going to go into such complications here; I will just update the previous model instead, since the required change is in itself quite small: basically, all we need to do is to introduce a softmax on multiple output neurons now (see the minimal sketch just below).

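  * To make this change concrete, here is a minimal before/after sketch of the output layer only (just an illustration, assuming, as in the classic CartPole policy gradient example, that the previous model used a single sigmoid output; the names layer1/W2/score/probability mirror the ones used in the implementation below): <sxh python>
import tensorflow as tf

H = 10        # hidden layer size
nactions = 3  # any number of actions is now supported

# Stand-in for the hidden layer activations:
layer1 = tf.placeholder(tf.float32, [None, H])

# Previous model (2 actions): a single sigmoid unit giving P(action == 1)
W2_old = tf.get_variable("W2_old", shape=[H, 1],
                         initializer=tf.contrib.layers.xavier_initializer())
probability_old = tf.nn.sigmoid(tf.matmul(layer1, W2_old))  # shape [batch, 1]

# Multi-action model: one score per action, normalized with a softmax
W2 = tf.get_variable("W2", shape=[H, nactions],
                     initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1, W2)       # shape [batch, nactions]
probability = tf.nn.softmax(score)  # each row sums to 1
</sxh>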
===== Analysis =====

  * **On the selection of responsible weights**: In the code above, we see that we are selecting the outputs that are directly responsible for the selected action to compute the loss: <sxh python>
self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)</sxh>
    * First, to compute the indices, we take the range over the number of rows of the output, so here this will be [0, batch_size-1].
    * Then we multiply this range by the number of columns in the output matrix, which is the number of actions, so we get the values [0, nactions, 2*nactions, ..., (batch_size-1)*nactions].
    * And then we add the value of the "action_holder" placeholder, so each index now points at the entry of the flattened output matrix that corresponds to the action chosen for that sample.
    * Now, I am actually not quite sure I understand **why** we proceed this way: we have a softmax activation on the output layer, so from my perspective there should be a more direct way of picking the probabilities of the chosen actions than this flatten-and-gather trick (the small worked example below shows what this indexing actually computes).

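  * To see concretely what this indexing computes, here is a small standalone example (plain numpy, with made-up numbers) for a batch of 3 samples and 2 actions: the selected entries are exactly the probabilities of the actions actually taken, which are the only terms that appear in the -log(pi(a|s)) * R loss: <sxh python>
import numpy as np

# Softmax outputs for a batch of 3 samples and nactions=2 (made-up values):
output = np.array([[0.7, 0.3],
                   [0.2, 0.8],
                   [0.6, 0.4]])
actions = np.array([0, 1, 1])          # action chosen for each sample
rewards = np.array([1.0, 1.0, -0.5])   # (discounted) rewards, also made-up

batch_size, nactions = output.shape

# Same computation as in the TensorFlow code:
# rows -> [0, 1, 2], times nactions -> [0, 2, 4], plus the chosen actions -> [0, 3, 5]
indexes = np.arange(batch_size) * nactions + actions
responsible_outputs = output.reshape(-1)[indexes]
print(responsible_outputs)  # [0.7 0.8 0.4]: probability of the chosen action for each sample

# The loss is then the usual REINFORCE loss:
loss = -np.mean(np.log(responsible_outputs) * rewards)
print(loss)
</sxh>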
===== Our base implementation =====

  * So after some refactoring of our previous version we now have: <sxh python>
logDEBUG("Building the policy network...")
env = gym.make('CartPole-v0')

# Hyperparameters
H = 10               # number of hidden layer neurons
batch_size = 5       # every how many episodes to do a param update?
learning_rate = 1e-2 # feel free to play with this to train faster or more stably.
gamma = 0.99         # discount factor for reward

D = 4 # input dimensionality

nactions = 2
# Array of the possible actions:
actions = np.arange(nactions)

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

tf.reset_default_graph()

# This defines the network as it goes from taking an observation of the environment to
# giving a probability of choosing the action of moving left or right.
observations = tf.placeholder(tf.float32, [None, D], name="input_x")
W1 = tf.get_variable("W1", shape=[D, H],
                     initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.relu(tf.matmul(observations, W1))
W2 = tf.get_variable("W2", shape=[H, nactions],
                     initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(layer1, W2)

# We build a softmax layer on top of the outputs:
probability = tf.nn.softmax(score)

# From here we define the parts of the network needed for learning a good policy.
tvars = tf.trainable_variables()

reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

indexes = tf.range(0, tf.shape(probability)[0]) * tf.shape(probability)[1] + action_holder
responsible_outputs = tf.gather(tf.reshape(probability, [-1]), indexes)

loss = -tf.reduce_mean(tf.log(responsible_outputs)*reward_holder)

newGrads = tf.gradients(loss, tvars)

# Once we have collected a series of gradients from multiple episodes, we apply them.
# We don't just apply gradients after every episode in order to account for noise in the reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate) # Our optimizer

gradient_holders = []
for idx, var in enumerate(tvars):
    placeholder = tf.placeholder(tf.float32, name=str(idx)+'_holder')
    gradient_holders.append(placeholder)

updateGrads = adam.apply_gradients(zip(gradient_holders, tvars))

# Running the training:
xs, ys, drs, tfps = [], [], [], []
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset() # Obtain an initial observation of the environment

    # Reset the gradient placeholder. We will collect gradients in
    # gradBuffer until we are ready to update our policy network.
    gradBuffer = sess.run(tvars)
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0

    while episode_number <= total_episodes:
        # Make sure the observation is in a shape the network can handle.
        x = np.reshape(observation, [1, D])

        # Probabilistically pick an action given our network outputs.
        probs = sess.run(probability, feed_dict={observations: x})
        # logDEBUG("Action probabilities: %s" % probs)
        action = np.random.choice(actions, p=probs[0])

        xs.append(x)      # observation
        ys.append(action) # a "fake label"

        # step the environment and get new measurements
        observation, reward, done, info = env.step(action)
        reward_sum += reward

        drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

        if done:
            episode_number += 1
            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            tfp = tfps
            xs, ys, drs, tfps = [], [], [], [] # reset array memory

            # compute the discounted reward backwards through time
            discounted_epr = discount_rewards(epr)
            # size the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)

            # Get the gradient for this episode, and save it in the gradBuffer
            # logDEBUG("epx shape: %s" % str(epx.shape))
            # logDEBUG("epy shape: %s" % str(epy.shape))
            # logDEBUG("discounted_epr shape: %s" % str(discounted_epr.shape))

            tGrad = sess.run(newGrads, feed_dict={observations: epx,
                                                  action_holder: epy.ravel(),
                                                  reward_holder: discounted_epr.ravel()})
            for ix, grad in enumerate(tGrad):
                gradBuffer[ix] += grad

            # If we have completed enough episodes, then update the policy network with our gradients.
            if episode_number % batch_size == 0:
                sess.run(updateGrads, feed_dict=dict(zip(gradient_holders, gradBuffer)))
                for ix, grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

                # Give a summary of how well our network is doing for each batch of episodes.
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                logDEBUG('Average reward for episode %f. Total average reward %f.' % (reward_sum//batch_size, running_reward//batch_size))

                if reward_sum//batch_size >= 200:
                    logDEBUG("Task solved in %d episodes!" % episode_number)
                    break

                reward_sum = 0

            observation = env.reset()

logDEBUG("%d episodes completed." % episode_number)
</sxh>

  * And the training is working :-) So we are all good.

  * But, still, we have a warning from TensorFlow: <code>
"Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory."</code>
  * => I believe this warning could be due to the slicing we perform to retrieve the "responsible outputs" from the probability matrix.
  * I replaced the code: <sxh python>
responsible_outputs = tf.gather(tf.reshape(probability, [-1]), indexes)

loss = -tf.reduce_mean(tf.log(responsible_outputs)*reward_holder)</sxh>
  * With the new code (along the lines of the sketch given at the end of this post):
  * => And **Yes**! This is still working just fine, and the TensorFlow warning message is gone now. All right! :-)
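  * For reference, a common way to select the responsible outputs without going through tf.gather (which is presumably what triggers the IndexedSlices warning when gradients flow back through it) is to mask the softmax output with a one-hot encoding of the chosen actions. This is only a minimal sketch of such a replacement, using the same placeholder names as in the implementation above: <sxh python>
import tensorflow as tf

nactions = 2

# Stand-ins for the tensors defined in the full implementation above:
probability = tf.placeholder(tf.float32, [None, nactions])      # softmax output of the policy network
action_holder = tf.placeholder(shape=[None], dtype=tf.int32)    # chosen action per sample
reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)  # (discounted) reward per sample

# Select pi(a_t|s_t) by masking each row with a one-hot encoding of the chosen action,
# instead of flattening the probability matrix and gathering by index:
action_mask = tf.one_hot(action_holder, nactions)
responsible_outputs = tf.reduce_sum(probability * action_mask, axis=1)

loss = -tf.reduce_mean(tf.log(responsible_outputs) * reward_holder)
</sxh>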