As a small extension to the previous policy gradient implementation we discussed, we are now going to study how to support multiple actions (i.e. num_actions > 2) in the policy network.
Our starting point is the agent class below, which already supports an arbitrary number of actions through its softmax output layer:

class agent():
    def __init__(self, lr, s_size, a_size, h_size):
        # These lines establish the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        hidden = slim.fully_connected(self.state_in, h_size, biases_initializer=None, activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden, a_size, activation_fn=tf.nn.softmax, biases_initializer=None)
        self.chosen_action = tf.argmax(self.output, 1)

        # The next six lines establish the training procedure. We feed the reward and chosen action into the network
        # to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)

        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx, var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32, name=str(idx) + '_holder')
            self.gradient_holders.append(placeholder)

        self.gradients = tf.gradients(self.loss, tvars)

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))
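For reference, instantiating this class for, say, a 4-dimensional state and 3 possible actions would look like the line below (the argument values are only illustrative):

# Hypothetical setup: 4 state inputs, 3 actions, 8 hidden units.
myAgent = agent(lr=1e-2, s_size=4, a_size=3, h_size=8)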
The key lines for supporting multiple actions are the following three: we flatten the [batch_size, a_size] softmax output into a single vector, then gather, for each sample in the batch, the probability that was assigned to the action actually taken:

self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)
self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs) * self.reward_holder)
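To see exactly what this indexing computes, here is a small standalone NumPy sketch (not part of the training code) that mimics it for a batch of 3 states and 2 actions:

import numpy as np

# Softmax outputs for a batch of 3 states with 2 actions each:
output = np.array([[0.9, 0.1],
                   [0.3, 0.7],
                   [0.2, 0.8]])
actions = np.array([0, 1, 1])  # actions actually taken

# Same arithmetic as the indexes tensor above:
indexes = np.arange(output.shape[0]) * output.shape[1] + actions  # -> [0, 3, 5]
responsible = output.reshape(-1)[indexes]                         # -> [0.9, 0.7, 0.8]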
With this indexing trick in place, here is the updated training function:

def train_policy_network_v2():
    logDEBUG("Building environment...")
    env = gym.make('CartPole-v0')

    # Hyperparameters
    H = 10              # number of hidden layer neurons
    batch_size = 5      # every how many episodes to do a param update?
    learning_rate = 1e-2  # feel free to play with this to train faster or more stably.
    gamma = 0.99        # discount factor for reward

    D = 4               # input dimensionality
    nactions = 2

    # Array of the possible actions:
    actions = np.arange(nactions)

    def discount_rewards(r):
        """ take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, r.size)):
            running_add = running_add * gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

    tf.reset_default_graph()

    # This defines the network as it goes from taking an observation of the environment to
    # giving a probability of choosing the action of moving left or right.
    observations = tf.placeholder(tf.float32, [None, D], name="input_x")
    W1 = tf.get_variable("W1", shape=[D, H],
                         initializer=tf.contrib.layers.xavier_initializer())
    layer1 = tf.nn.relu(tf.matmul(observations, W1))
    W2 = tf.get_variable("W2", shape=[H, nactions],
                         initializer=tf.contrib.layers.xavier_initializer())
    score = tf.matmul(layer1, W2)

    # We build a softmax layer on top of the outputs:
    probability = tf.nn.softmax(score)

    # From here we define the parts of the network needed for learning a good policy.
    tvars = tf.trainable_variables()
    reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

    indexes = tf.range(0, tf.shape(probability)[0]) * tf.shape(probability)[1] + action_holder
    responsible_outputs = tf.gather(tf.reshape(probability, [-1]), indexes)
    loss = -tf.reduce_mean(tf.log(responsible_outputs) * reward_holder)

    newGrads = tf.gradients(loss, tvars)

    # Once we have collected a series of gradients from multiple episodes, we apply them.
    # We don't just apply gradients after every episode in order to account for noise in the reward signal.
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)  # Our optimizer
    gradient_holders = []
    for idx, var in enumerate(tvars):
        placeholder = tf.placeholder(tf.float32, name=str(idx) + '_holder')
        gradient_holders.append(placeholder)
    updateGrads = adam.apply_gradients(zip(gradient_holders, tvars))

    # Running the training:
    xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 1
    total_episodes = 10000
    init = tf.global_variables_initializer()

    # Launch the graph
    with tf.Session() as sess:
        rendering = False
        sess.run(init)
        observation = env.reset()  # Obtain an initial observation of the environment

        # Reset the gradient placeholder. We will collect gradients in
        # gradBuffer until we are ready to update our policy network.
        gradBuffer = sess.run(tvars)
        for ix, grad in enumerate(gradBuffer):
            gradBuffer[ix] = grad * 0

        while episode_number <= total_episodes:
            # Make sure the observation is in a shape the network can handle.
            x = np.reshape(observation, [1, D])

            # Probabilistically pick an action given our network outputs.
            probs = sess.run(probability, feed_dict={observations: x})
            # logDEBUG("Probabilities: %s" % probs)
            action = np.random.choice(actions, p=probs[0])

            xs.append(x)       # observation
            ys.append(action)  # a "fake label"

            # step the environment and get new measurements
            observation, reward, done, info = env.step(action)
            reward_sum += reward

            drs.append(reward)  # record reward (has to be done after we call step() to get reward for previous action)

            if done:
                episode_number += 1
                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                epx = np.vstack(xs)
                epy = np.vstack(ys)
                epr = np.vstack(drs)
                tfp = tfps
                xs, hs, dlogps, drs, ys, tfps = [], [], [], [], [], []  # reset array memory

                # compute the discounted reward backwards through time
                discounted_epr = discount_rewards(epr)
                # size the rewards to be unit normal (helps control the gradient estimator variance)
                discounted_epr -= np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)

                # Get the gradient for this episode, and save it in the gradBuffer
                # logDEBUG("epx shape: %s" % str(epx))
                # logDEBUG("epy shape: %s" % str(epy))
                # logDEBUG("epr shape: %s" % str(discounted_epr))
                tGrad = sess.run(newGrads, feed_dict={observations: epx,
                                                      action_holder: epy.reshape(-1),
                                                      reward_holder: discounted_epr.reshape(-1)})
                for ix, grad in enumerate(tGrad):
                    gradBuffer[ix] += grad

                # If we have completed enough episodes, then update the policy network with our gradients.
                if episode_number % batch_size == 0:
                    sess.run(updateGrads, feed_dict=dict(zip(gradient_holders, gradBuffer)))
                    for ix, grad in enumerate(gradBuffer):
                        gradBuffer[ix] = grad * 0

                    # Give a summary of how well our network is doing for each batch of episodes.
                    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                    logDEBUG('%d/%d: Average reward for last %d episodes: %f. Total average reward %f.'
                             % (episode_number, total_episodes, batch_size,
                                reward_sum // batch_size, running_reward // batch_size))

                    if reward_sum // batch_size >= 200:
                        logDEBUG("Task solved in %d episodes!" % episode_number)
                        break

                    reward_sum = 0

                observation = env.reset()

    logDEBUG("%d episodes completed." % episode_number)
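As a quick sanity check on the discounting logic, here is what discount_rewards computes if we lift the helper out of the function with gamma = 0.99 (a standalone sketch, not part of the training code):

import numpy as np

gamma = 0.99

def discount_rewards(r):
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

# Working backwards: 1.0, then 0.99*1.0 + 1.0 = 1.99, then 0.99*1.99 + 1.0 = 2.9701
print(discount_rewards(np.array([1.0, 1.0, 1.0])))  # -> [2.9701 1.99   1.    ]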
When running this version, TensorFlow emits the following warning:

/mnt/array1/dev/projects/NervSeed/tools/linux/python-3.6/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:110: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory. "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
This warning is triggered by the tf.gather indexing we use to retrieve the responsible outputs: the gradient of tf.gather is a sparse IndexedSlices object, and backpropagating it through the tf.reshape forces a conversion to a dense tensor of unknown shape:

indexes = tf.range(0, tf.shape(probability)[0]) * tf.shape(probability)[1] + action_holder
responsible_outputs = tf.gather(tf.reshape(probability, [-1]), indexes)
loss = -tf.reduce_mean(tf.log(responsible_outputs) * reward_holder)
We can avoid this indexing entirely: tf.nn.sparse_softmax_cross_entropy_with_logits computes -log(softmax(score)[action]) directly from the logits, so the same reward-weighted negative log-likelihood can be written as:

loss = tf.reduce_mean(tf.multiply(reward_holder,
                                  tf.nn.sparse_softmax_cross_entropy_with_logits(logits=score, labels=action_holder)))

This removes the warning, and it is also more numerically stable since the softmax and the log are fused into a single op.
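To convince ourselves that the two formulations really compute the same value, here is a small standalone sketch (TF1-style, reusing the score/action_holder/reward_holder names from above) comparing them on random inputs:

import numpy as np
import tensorflow as tf

tf.reset_default_graph()
score = tf.placeholder(tf.float32, [None, 2])
action_holder = tf.placeholder(tf.int32, [None])
reward_holder = tf.placeholder(tf.float32, [None])

# Original gather-based loss:
probability = tf.nn.softmax(score)
indexes = tf.range(0, tf.shape(probability)[0]) * tf.shape(probability)[1] + action_holder
responsible_outputs = tf.gather(tf.reshape(probability, [-1]), indexes)
loss_gather = -tf.reduce_mean(tf.log(responsible_outputs) * reward_holder)

# Cross-entropy based loss:
xent = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=score, labels=action_holder)
loss_xent = tf.reduce_mean(tf.multiply(reward_holder, xent))

with tf.Session() as sess:
    feed = {score: np.random.randn(4, 2),
            action_holder: [0, 1, 1, 0],
            reward_holder: [1.0, -0.5, 2.0, 0.3]}
    print(sess.run([loss_gather, loss_xent], feed_dict=feed))  # the two values should match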