In this post, we are going to build a simple policy gradient experiment on an “n-armed bandit” problem.
import numpy as np
import tensorflow as tf

from nv.core.utils import *

# cf. https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-1-fd544fab149

# List out our bandit arms.
# Currently arm 4 (index #3) is set to most often provide a positive reward.
bandit_arms = [0.2,0,-0.2,-2]
num_arms = len(bandit_arms)

def pullBandit(bandit):
    # Get a random number.
    result = np.random.randn(1)
    if result > bandit:
        # return a positive reward.
        return 1
    else:
        # return a negative reward.
        return -1

def train_armed_bandit_network():
    tf.reset_default_graph()

    # These two lines establish the feed-forward part of the network.
    weights = tf.Variable(tf.ones([num_arms]))
    output = tf.nn.softmax(weights)

    # The next six lines establish the training procedure. We feed the reward and chosen
    # action into the network to compute the loss, and use it to update the network.
    reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
    action_holder = tf.placeholder(shape=[1],dtype=tf.int32)

    responsible_output = tf.slice(output,action_holder,[1])
    loss = -(tf.log(responsible_output)*reward_holder)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
    update = optimizer.minimize(loss)

    total_episodes = 1000 # Set total number of episodes to train agent on.
    total_reward = np.zeros(num_arms) # Set scoreboard for bandit arms to 0.

    init = tf.global_variables_initializer()

    # Launch the tensorflow graph.
    with tf.Session() as sess:
        sess.run(init)
        i = 0
        while i < total_episodes:
            # Choose action according to Boltzmann distribution.
            actions = sess.run(output)
            a = np.random.choice(actions,p=actions)
            action = np.argmax(actions == a)

            # Get our reward from picking one of the bandit arms.
            reward = pullBandit(bandit_arms[action])

            # Update the network.
            _,resp,ww = sess.run([update,responsible_output,weights],
                                 feed_dict={reward_holder:[reward],action_holder:[action]})

            # Update our running tally of scores.
            total_reward[action] += reward
            if i % 50 == 0:
                logDEBUG("Running reward for the " + str(num_arms) + " arms of the bandit: " + str(total_reward))
            i+=1

    logDEBUG("\nThe agent thinks arm " + str(np.argmax(ww)+1) + " is the most promising....")
    if np.argmax(ww) == np.argmax(-np.array(bandit_arms)):
        logDEBUG("...and it was right!")
    else:
        logDEBUG("...and it was wrong!")
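Note: logDEBUG above comes from the nv.core.utils helpers used on this project; assuming it simply logs its message, a trivial stand-in is enough to run the script on its own:

# Hypothetical stand-in for nv.core.utils.logDEBUG,
# assumed to do nothing more than print its message:
def logDEBUG(msg):
    print(msg)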
One step in this implementation deserves a closer look: the action selection inside the training loop.

# Choose action according to Boltzmann distribution.
actions = sess.run(output)
a = np.random.choice(actions,p=actions)
action = np.argmax(actions == a)

Here np.random.choice draws a probability value from the softmax output, and np.argmax(actions == a) then recovers the index of that value. This round trip is both wasteful and fragile: if two arms ever end up with the same probability, argmax always returns the first match, so the other arm can never be selected.
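To make that failure mode concrete, here is a tiny standalone illustration (with made-up probabilities, outside of any TensorFlow session):

import numpy as np

probs = np.array([0.4, 0.2, 0.4]) # arms 1 and 3 share the same probability
a = np.random.choice(probs, p=probs)
# Whenever a == 0.4 is drawn, np.argmax(probs == a) returns index 0,
# so arm 3 (index 2) can never be selected this way:
print(np.argmax(probs == a))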
A simpler and safer version samples the arm index directly:

actions = np.arange(num_arms) # keep this out of the loop obviously.
probs = sess.run(output)
action = np.random.choice(actions,p=probs)
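As a side note, np.random.choice also accepts an integer as its first argument, in which case it samples from np.arange(n) internally, so the helper array could be dropped entirely:

action = np.random.choice(num_arms, p=probs)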
Now, why does this setup learn the best arm? The loss we minimize for the selected arm is \(Loss = -\log(out_i) \times R\), so let's compute its gradient with respect to the corresponding weight:

\[ \frac{\partial Loss}{\partial w_i} = \frac{\partial}{\partial w_i} \left(- \log(out_i) \times R \right)\]

\[ \frac{\partial Loss}{\partial w_i} = -R \times \frac{\partial}{\partial w_i} \left(out_i\right) \times \frac{1}{out_i} \]
Since the output is the softmax of the weights, the derivative of \(out_i\) with respect to its own weight expands with the product rule as follows:

\begin{align}
\frac{\partial}{\partial w_i} \left(out_i\right) & = \frac{\partial}{\partial w_i} \left(\frac{e^{w_i}}{\sum_k e^{w_k}}\right) \\
& = \frac{\partial}{\partial w_i} \left(e^{w_i}\right) \frac{1}{\sum_k e^{w_k}} + e^{w_i} \frac{\partial}{\partial w_i} \left(\frac{1}{\sum_k e^{w_k}}\right) \\
& = \frac{e^{w_i}}{\sum_k e^{w_k}} + e^{w_i} e^{w_i} \left( -\frac{1}{(\sum_k e^{w_k})^2}\right) \\
& = \frac{e^{w_i}}{\sum_k e^{w_k}} - \left( \frac{e^{w_i}}{\sum_k e^{w_k}} \right)^2 \\
& = out_i \cdot ( 1 - out_i)
\end{align}
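As a quick sanity check of this closed form, we can compare it against a finite-difference estimate (a throwaway snippet with arbitrary example weights):

import numpy as np

def softmax(w):
    e = np.exp(w - np.max(w)) # shifted for numerical stability
    return e / e.sum()

w = np.array([0.5, -1.0, 2.0, 0.0]) # arbitrary example weights
i, eps = 2, 1e-6
out = softmax(w)

w_plus = w.copy()
w_plus[i] += eps
numeric = (softmax(w_plus)[i] - out[i]) / eps
analytic = out[i] * (1.0 - out[i])
print(numeric, analytic) # both are ~0.2059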
Plugging this back into the gradient of the loss, the \(out_i\) terms cancel out nicely:

\[ \frac{\partial Loss}{\partial w_i} = - R \cdot (1 - out_i)\]
A gradient-descent step with learning rate \(\alpha\) therefore gives the update:

\[ w_i = w_i + \alpha \cdot R \cdot (1 - out_i)\]

Since \(1 - out_i > 0\), a positive reward increases the weight of the chosen arm (and thus its softmax probability), while a negative reward decreases it: exactly the behavior we want.
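To double-check the whole derivation, here is a minimal NumPy-only sketch of the same agent, applying the update by hand (plain SGD instead of the Adam optimizer used above). Note that the softmax couples all of the weights through its denominator, so the non-chosen arms also receive a gradient (for \(j \neq i\), \(\frac{\partial Loss}{\partial w_j} = R \cdot out_j\)); the sketch includes those cross terms so that it matches what TensorFlow's autodiff computes:

import numpy as np

bandit_arms = [0.2, 0, -0.2, -2]
num_arms = len(bandit_arms)
actions = np.arange(num_arms)

def pullBandit(bandit):
    # Same reward rule as above: +1 if a standard normal sample beats the arm value.
    return 1 if np.random.randn() > bandit else -1

def softmax(w):
    e = np.exp(w - np.max(w)) # shifted for numerical stability
    return e / e.sum()

weights = np.ones(num_arms)
alpha = 1e-3

for i in range(1000):
    probs = softmax(weights)
    action = np.random.choice(actions, p=probs)
    reward = pullBandit(bandit_arms[action])

    # Gradient of Loss = -log(out_action) * R:
    # dLoss/dw_j = R * out_j for every arm j, with an extra -R on the chosen arm.
    grad = reward * probs
    grad[action] -= reward # chosen arm: dLoss/dw_i = -R * (1 - out_i)
    weights -= alpha * grad # plain SGD step

print("The agent thinks arm " + str(np.argmax(weights)+1) + " is the most promising....")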