Continuing on my current “Reinforcement Learning” path, we are now going to try a Q-network implementation, which we will again train on the FrozenLake environment.
```python
import gym
import numpy as np
import random
import tensorflow as tf

from nv.core.utils import *

def train_qn_env(numEpisodes=2000, ename="FrozenLake-v0"):
    logDEBUG("Building FrozenLake environment...")
    env = gym.make(ename)

    # Implementation of the network:
    tf.reset_default_graph()

    # These lines establish the feed-forward part of the network used to choose actions:
    inputs1 = tf.placeholder(shape=[1,16], dtype=tf.float32)
    W = tf.Variable(tf.random_uniform([16,4], 0, 0.01))
    Qout = tf.matmul(inputs1, W)
    predict = tf.argmax(Qout, 1)

    # Below we obtain the loss by taking the sum of squares difference
    # between the target and prediction Q values:
    nextQ = tf.placeholder(shape=[1,4], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(nextQ - Qout))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    updateModel = trainer.minimize(loss)

    # Actual training of the model:
    logDEBUG("Initializing training...")
    init = tf.global_variables_initializer()

    # Set learning parameters:
    y = .99
    e = 0.1

    # Create lists to contain total rewards and steps per episode:
    jList = []
    rList = []

    with tf.Session() as sess:
        sess.run(init)
        for i in range(numEpisodes):
            # Reset environment and get first new observation:
            s = env.reset()
            rAll = 0
            d = False
            j = 0
            # The Q-Network:
            while j < 99:
                j += 1
                # Choose an action greedily (with e chance of random action) from the Q-network:
                a, allQ = sess.run([predict, Qout], feed_dict={inputs1: np.identity(16)[s:s+1]})
                if np.random.rand(1) < e:
                    a[0] = env.action_space.sample()

                # Get new state and reward from environment:
                s1, r, d, _ = env.step(a[0])

                # Obtain the Q' values by feeding the new state through our network:
                Q1 = sess.run(Qout, feed_dict={inputs1: np.identity(16)[s1:s1+1]})

                # Obtain maxQ' and set our target value for the chosen action:
                maxQ1 = np.max(Q1)
                targetQ = allQ
                targetQ[0, a[0]] = r + y*maxQ1

                # Train our network using target and predicted Q values:
                _, W1 = sess.run([updateModel, W],
                                 feed_dict={inputs1: np.identity(16)[s:s+1], nextQ: targetQ})
                rAll += r
                s = s1
                if d == True:
                    # Reduce chance of random action as we train the model:
                    e = 1./((i/50) + 10)
                    break

            jList.append(j)
            rList.append(rAll)
            logDEBUG("Episode %d/%d: reward: %f" % (i+1, numEpisodes, rAll))

    # Note: we multiply by 100 here since we are reporting a percentage:
    logDEBUG("Percent of successful episodes: " + str(100.0*sum(rList)/numEpisodes) + "%")
```
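As a quick aside, the `np.identity(16)[s:s+1]` expression used in the feed dict above is just a compact way of building a one-hot encoding of the current state; here is a minimal standalone illustration:

```python
import numpy as np

s = 3  # say the agent sits on cell 3 of the 4x4 FrozenLake grid
onehot = np.identity(16)[s:s+1]

print(onehot.shape)       # (1, 16): a single row vector
print(np.argmax(onehot))  # 3: only column s is set to 1.0
```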
```python
def train_qn_env_v2(numEpisodes=2000, ename="FrozenLake-v0"):
    logDEBUG("Building FrozenLake environment...")
    env = gym.make(ename)

    # Implementation of the network:
    tf.reset_default_graph()

    # These lines establish the feed-forward part of the network used to choose actions.
    # Instead of feeding one-hot states one at a time, we feed all 16 states at once by default:
    # inputs = tf.placeholder(shape=[1,16],dtype=tf.float32)
    inputs = tf.placeholder_with_default(np.identity(16, dtype=np.float32), (16,16), "states")
    W = tf.Variable(tf.random_uniform([16,4], 0, 0.01))
    Qout = tf.matmul(inputs, W)

    # Below we obtain the loss by taking the sum of squares difference
    # between the target and prediction Q values:
    nextQ = tf.placeholder(shape=[16,4], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(nextQ - Qout))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    updateModel = trainer.minimize(loss)

    # Actual training of the model:
    logDEBUG("Initializing training...")
    init = tf.global_variables_initializer()

    # Set learning parameters:
    y = .99
    e = 0.1

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    with tf.Session() as sess:
        sess.run(init)
        for i in range(numEpisodes):
            # Reset environment and get first new observation:
            s = env.reset()
            rAll = 0
            d = False
            j = 0
            action = None
            # The Q-Network:
            while j < 99:
                j += 1
                # Choose an action greedily (with e chance of random action) from the Q-network:
                allQ = sess.run(Qout)
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(allQ[s,:])

                # Get new state and reward from environment:
                s1, r, d, _ = env.step(action)

                # We already have the Q' values, since we evaluated all states above.
                # Obtain maxQ' and set our target value for the chosen action:
                maxQ1 = np.max(allQ[s1,:])
                targetQ = allQ
                targetQ[s, action] = r + y*maxQ1

                # Train our network using target and predicted Q values:
                sess.run(updateModel, feed_dict={nextQ: targetQ})
                rAll += r
                s = s1
                if d == True:
                    # Reduce chance of random action as we train the model:
                    e = 1./((i/50) + 10)
                    break

            # And we assign our data for visualization:
            rList[i] = [rAll, j]
            if (i+1) % 100 == 0:
                mvals = np.mean(rList[i-99:i+1], axis=0)
                logDEBUG("%d/%d: Mean reward: %f" % (i+1, numEpisodes, mvals[0]))

    return rList
```
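One detail worth spelling out in this second version: since the default input is the 16x16 identity matrix, evaluating `Qout` with an empty feed simply returns the current value of `W`, so we are effectively maintaining a full Q-table inside the weight matrix. A quick standalone sanity check of this claim (my own sketch, not part of the training code):

```python
import numpy as np
import tensorflow as tf

tf.reset_default_graph()
inputs = tf.placeholder_with_default(np.identity(16, dtype=np.float32), (16,16), "states")
W = tf.Variable(tf.random_uniform([16,4], 0, 0.01))
Qout = tf.matmul(inputs, W)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    q, w = sess.run([Qout, W])
    print(np.allclose(q, w))  # True: identity input means Qout == W
```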
```python
def train_qn_env_v3(numEpisodes=2000, ename="FrozenLake-v0"):
    logDEBUG("Building FrozenLake environment...")
    env = gym.make(ename)

    nstates = env.observation_space.n
    nactions = env.action_space.n

    # Implementation of the network:
    tf.reset_default_graph()

    # These lines establish the feed-forward part of the network used to choose actions:
    # inputs = tf.placeholder(shape=[1,16],dtype=tf.float32)
    inputs = tf.placeholder_with_default(np.identity(nstates, dtype=np.float32),
                                         (nstates, nstates), "states")
    W = tf.Variable(tf.random_uniform([nstates, nactions], 0, 0.01))
    Qout = tf.matmul(inputs, W)

    # Below we obtain the loss by taking the sum of squares difference
    # between the target and prediction Q values:
    nextQ = tf.placeholder(shape=[nstates, nactions], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(nextQ - Qout))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    updateModel = trainer.minimize(loss)

    # Actual training of the model:
    logDEBUG("Initializing training...")
    init = tf.global_variables_initializer()

    # Set learning parameters:
    y = .99
    e = 0.1

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    # Here we will also use a state transition tensor:
    tProbs = np.zeros((nstates, nactions, nstates))

    # And a reward probability matrix.
    # Note: the first column of this matrix tells us how many
    # observations we have, and the second column tells us what
    # is the current total reward we got so far with this path.
    rProbs = np.zeros((nstates, nactions, 2))

    # Target Q matrix:
    targetQ = np.zeros((nstates, nactions))

    # epsilon value:
    eps = 1e-10

    with tf.Session() as sess:
        sess.run(init)
        for i in range(numEpisodes):
            # Reset environment and get first new observation:
            s = env.reset()
            rAll = 0
            d = False
            j = 0
            action = None
            # The Q-Network:
            while j < 99:
                j += 1
                # Choose an action greedily (with e chance of random action) from the Q-network:
                allQ = sess.run(Qout)
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(allQ[s,:])

                # Get new state and reward from environment:
                s1, r, d, _ = env.step(action)

                # Fill our transitions and reward matrices:
                tProbs[s, action, s1] += 1.0
                rProbs[s, action, :] += [1.0, r]

                # Now we update all the nextQ values together.
                # For each (S,a) pair in the targetQ matrix, we need to set the value
                # to: mean_reward + y * expected_next_state_value.
                # So we start with the mean reward:
                targetQ = rProbs[:,:,1]/(rProbs[:,:,0]+eps)

                # Compute the transition weights from each state:
                weights = tProbs/(np.sum(tProbs, axis=2, keepdims=True)+eps)

                # For each (S,a) pair we thus have the nstates weights of each possible
                # following state; we dot product this with the max Q'(S',a') values
                # we currently have:
                maxQs = np.max(allQ, axis=1)
                targetQ += y * np.dot(weights, maxQs)

                # Train our network using target and predicted Q values:
                sess.run(updateModel, feed_dict={nextQ: targetQ})
                rAll += r
                s = s1
                if d == True:
                    # Reduce chance of random action as we train the model:
                    e = 1./((i/50) + 10)
                    break

            # And we assign our data for visualization:
            rList[i] = [rAll, j]
            if (i+1) % 100 == 0:
                mvals = np.mean(rList[i-99:i+1], axis=0)
                logDEBUG("%d/%d: Mean reward: %f" % (i+1, numEpisodes, mvals[0]))

    return rList
```
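To make this target construction concrete, here is a tiny standalone numeric example of the same computation (toy sizes and values of my own choosing, with 2 states and 2 actions):

```python
import numpy as np

y = 0.99
eps = 1e-10

# tProbs[s, a, s1]: number of observed (s, a) -> s1 transitions:
tProbs = np.zeros((2, 2, 2))
tProbs[0, 0, 0] = 1.0  # action 0 in state 0 led back to state 0 once...
tProbs[0, 0, 1] = 3.0  # ...and to state 1 three times.

# rProbs[s, a]: [observation count, accumulated reward]:
rProbs = np.zeros((2, 2, 2))
rProbs[0, 0] = [4.0, 2.0]  # mean reward of 0.5 for the (s=0, a=0) pair

allQ = np.array([[0.1, 0.2], [0.4, 0.3]])  # current Q estimates

targetQ = rProbs[:,:,1]/(rProbs[:,:,0]+eps)
weights = tProbs/(np.sum(tProbs, axis=2, keepdims=True)+eps)
maxQs = np.max(allQ, axis=1)
targetQ += y * np.dot(weights, maxQs)

print(targetQ[0,0])  # ~0.8465 = 0.5 + 0.99*(0.25*0.2 + 0.75*0.4)
```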
```python
def updateQTarget(sourceQ, counter=5, threshold=1e-4):
    # Note: rProbs, tProbs, y and eps are taken from the enclosing scope.
    # For each (S,a) pair in the targetQ matrix, we need to set the value
    # to: mean_reward + y * expected_next_state_value.
    # So we start with the mean reward:
    targetQ = rProbs[:,:,1]/(rProbs[:,:,0]+eps)

    # Compute the transition weights from each state:
    weights = tProbs/(np.sum(tProbs, axis=2, keepdims=True)+eps)

    # For each (S,a) pair we thus have the nstates weights of each possible
    # following state; we dot product this with the max Q'(S',a') values
    # we currently have:
    maxQs = np.max(sourceQ, axis=1)
    targetQ += y * np.dot(weights, maxQs)

    # Compute the difference between the source and target matrices:
    tval = np.sum(np.abs(targetQ))
    sval = np.sum(np.abs(sourceQ))
    if tval == sval:
        return targetQ

    diff = 2.0*abs(tval - sval)/(sval + tval)
    if diff > threshold and counter > 0:
        # logDEBUG("%d: Current targetQ change ratio: %f" % (counter, diff))
        return updateQTarget(targetQ, counter-1)

    return targetQ
```
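In other words, this helper re-applies the Bellman-style backup recursively, up to `counter` times, and stops early once the relative change in the summed absolute Q values falls below `threshold`: the targets are pushed toward the fixed point of the backup instead of moving by a single step per environment transition.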
```python
# Fill our transitions and reward matrices:
tProbs[s, action, s1] += 1.0
rProbs[s, action, :] += [1.0, r]

targetQ = updateQTarget(allQ)

# Train our network using target and predicted Q values:
sess.run(updateModel, feed_dict={nextQ: targetQ})
rAll += r
s = s1
```
with: \[p_i = \frac{Q[S, a, S_i, 0]}{\sum_j Q[S, a, S_j, 0]} \]
\[Q^t_{i,j} = \sum_k p_{i,j,k} \left( R_{i,j,k} + \gamma Q^s_k \right)\]
\[Q^t_{i,j} = \sum_{k \neq i} p_{i,j,k} \left( R_{i,j,k} + \gamma Q^s_k \right) + p_{i,j,i} \left( R_{i,j,i} + \gamma Q^s_i \right)\]
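In code, this backup could be sketched as follows; a minimal NumPy illustration with assumed shapes, where `p[i,j,k]`, `R[i,j,k]` and `Qs[k]` play the roles of \(p_{i,j,k}\), \(R_{i,j,k}\) and \(Q^s_k\) (the second equation above just pulls the self-transition term \(k = i\) out of the sum, so both forms compute the same values):

```python
import numpy as np

nstates, nactions = 16, 4
gamma = 0.99

# Assumed inputs: transition probabilities, per-transition rewards,
# and the per-state source values Q^s:
p = np.random.rand(nstates, nactions, nstates)
p /= p.sum(axis=2, keepdims=True)
R = np.random.rand(nstates, nactions, nstates)
Qs = np.random.rand(nstates)

# Q^t_{i,j} = sum_k p_{i,j,k} * (R_{i,j,k} + gamma * Q^s_k):
Qt = np.einsum('ijk,ijk->ij', p, R + gamma*Qs)
print(Qt.shape)  # (16, 4)
```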
```python
# Normalize the target Q values row by row (a softmax over the actions):
targetQ = np.exp(targetQ)
targetQ /= np.sum(targetQ, axis=1, keepdims=True)  # the sum cannot be zero.
```
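For reference, those two lines amount to a row-wise softmax; a standalone equivalent, with the usual max-subtraction trick added for numerical stability (it leaves the result unchanged):

```python
import numpy as np

def softmax_rows(x):
    # Subtracting the row max avoids overflow in np.exp for large Q values
    # without changing the normalized result.
    z = np.exp(x - np.max(x, axis=1, keepdims=True))
    return z / np.sum(z, axis=1, keepdims=True)
```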