Now, finally, something that sounds a bit more challenging on my Reinforcement Learning journey: Deep Q-Networks! Even if, once you start analyzing the structure of these networks, they turn out to be pretty simple in the end, there are still 3 key elements we should consider here: convolutional layers, experience replay, and the use of a target network.
On top of that, we will also introduce the Double DQN and Dueling DQN extensions, which are used to improve training stability and performance. Let's get started!
import numpy as np
import random
import gym
import tensorflow as tf
import tensorflow.contrib.slim as slim

from nv.core.utils import *
from nv.deep_learning.gridworld import gameEnv

# cf. https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-4-deep-q-networks-and-beyond-8438a3e2b8df

# Implementing the network itself:
class Qnetwork():
    def __init__(self, h_size, env):
        # The network receives a frame from the game, flattened into an array.
        # It then resizes it and processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None,21168], dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1,84,84,3])
        self.conv1 = slim.conv2d(inputs=self.imageIn, num_outputs=32,
                                 kernel_size=[8,8], stride=[4,4],
                                 padding='VALID', biases_initializer=None)
        self.conv2 = slim.conv2d(inputs=self.conv1, num_outputs=64,
                                 kernel_size=[4,4], stride=[2,2],
                                 padding='VALID', biases_initializer=None)
        self.conv3 = slim.conv2d(inputs=self.conv2, num_outputs=64,
                                 kernel_size=[3,3], stride=[1,1],
                                 padding='VALID', biases_initializer=None)
        self.conv4 = slim.conv2d(inputs=self.conv3, num_outputs=h_size,
                                 kernel_size=[7,7], stride=[1,1],
                                 padding='VALID', biases_initializer=None)

        # We take the output from the final convolutional layer and split it
        # into separate advantage and value streams.
        self.streamAC, self.streamVC = tf.split(self.conv4, 2, 3)
        self.streamA = slim.flatten(self.streamAC)
        self.streamV = slim.flatten(self.streamVC)
        xavier_init = tf.contrib.layers.xavier_initializer()
        self.AW = tf.Variable(xavier_init([h_size//2, env.actions]))
        self.VW = tf.Variable(xavier_init([h_size//2, 1]))
        self.Advantage = tf.matmul(self.streamA, self.AW)
        self.Value = tf.matmul(self.streamV, self.VW)

        # Then combine them together to get our final Q-values.
        self.Qout = self.Value + tf.subtract(self.Advantage,
            tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))
        self.predict = tf.argmax(self.Qout, 1)

        # Below we obtain the loss by taking the sum of squares difference
        # between the target and prediction Q-values.
        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, env.actions, dtype=tf.float32)

        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)

        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)

# Experience replay storage class:
class experience_buffer():
    def __init__(self, buffer_size=50000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        # Drop the oldest entries if the buffer would overflow:
        if len(self.buffer) + len(experience) >= self.buffer_size:
            self.buffer[0:(len(experience)+len(self.buffer))-self.buffer_size] = []
        self.buffer.extend(experience)

    def sample(self, size):
        return np.reshape(np.array(random.sample(self.buffer, size)), [size,5])

# Function to resize our game frames:
def processState(states):
    return np.reshape(states, [21168])

# These functions allow us to update the parameters of our target network
# with those of the primary network.
def updateTargetGraph(tfVars, tau):
    total_vars = len(tfVars)
    op_holder = []
    for idx, var in enumerate(tfVars[0:total_vars//2]):
        op_holder.append(tfVars[idx+total_vars//2].assign(
            (var.value()*tau) + ((1-tau)*tfVars[idx+total_vars//2].value())))
    return op_holder

def updateTarget(op_holder, sess):
    for op in op_holder:
        sess.run(op)

def train_deep_q_network(path):
    logDEBUG("Building environment...")
    env = gameEnv(partial=False, size=5)

    # Training parameters:
    batch_size = 32          # How many experiences to use for each training step.
    update_freq = 4          # How often to perform a training step.
    y = .99                  # Discount factor on the target Q-values.
    startE = 1               # Starting chance of random action.
    endE = 0.1               # Final chance of random action.
    annealing_steps = 10000. # How many steps of training to reduce startE to endE.
    num_episodes = 10000     # How many episodes of game environment to train network with.
    pre_train_steps = 10000  # How many steps of random actions before training begins.
    max_epLength = 50        # The max allowed length of our episode.
    load_model = False       # Whether to load a saved model.
    h_size = 512             # Size of the final convolutional layer before splitting it
                             # into Advantage and Value streams.
    tau = 0.001              # Rate to update target network toward primary network.

    # Actual training process:
    tf.reset_default_graph()
    mainQN = Qnetwork(h_size, env)
    targetQN = Qnetwork(h_size, env)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    trainables = tf.trainable_variables()
    targetOps = updateTargetGraph(trainables, tau)

    myBuffer = experience_buffer()

    # Set the rate of random action decrease:
    e = startE
    stepDrop = (startE - endE)/annealing_steps

    # Create lists to contain total rewards and steps per episode:
    jList = []
    rList = []
    total_steps = 0

    # Ensure the provided path already exists:
    CHECK(nvDirExists(path), "Invalid storage path: %s" % path)

    with tf.Session() as sess:
        sess.run(init)
        if load_model == True:
            logDEBUG('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        for i in range(num_episodes):
            episodeBuffer = experience_buffer()
            # Reset environment and get first new observation:
            s = env.reset()
            s = processState(s)
            d = False
            rAll = 0
            j = 0
            # The Q-Network:
            while j < max_epLength:
                # If the agent takes longer than max_epLength moves to reach
                # either of the blocks, end the trial.
                j += 1
                # Choose an action greedily (with probability e of a random action) from the Q-network:
                if np.random.rand(1) < e or total_steps < pre_train_steps:
                    a = np.random.randint(0, 4)
                else:
                    a = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: [s]})[0]
                s1, r, d = env.step(a)
                s1 = processState(s1)
                total_steps += 1
                # Save the experience to our episode buffer:
                episodeBuffer.add(np.reshape(np.array([s, a, r, s1, d]), [1, 5]))

                if total_steps > pre_train_steps:
                    if e > endE:
                        e -= stepDrop

                    if total_steps % (update_freq) == 0:
                        # Get a random batch of experiences:
                        trainBatch = myBuffer.sample(batch_size)
                        # Below we perform the Double-DQN update to the target Q-values:
                        Q1 = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:,3])})
                        Q2 = sess.run(targetQN.Qout, feed_dict={targetQN.scalarInput: np.vstack(trainBatch[:,3])})
                        end_multiplier = -(trainBatch[:,4] - 1)
                        doubleQ = Q2[range(batch_size), Q1]
                        targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)
                        # Update the network with our target values:
                        _ = sess.run(mainQN.updateModel,
                                     feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:,0]),
                                                mainQN.targetQ: targetQ,
                                                mainQN.actions: trainBatch[:,1]})

                        # Update the target network toward the primary network:
                        updateTarget(targetOps, sess)

                rAll += r
                s = s1

                if d == True:
                    break

            myBuffer.add(episodeBuffer.buffer)
            jList.append(j)
            rList.append(rAll)

            # Periodically save the model:
            if i % 1000 == 0:
                saver.save(sess, path+'/model-'+str(i)+'.ckpt')
                logDEBUG("Saved Model")
            if len(rList) % 10 == 0:
                logDEBUG("Total steps: %d, mean value: %f, e: %f" % (total_steps, np.mean(rList[-10:]), e))

        saver.save(sess, path+'/model-'+str(i)+'.ckpt')

    logDEBUG("Percent of successful episodes: " + str(sum(rList)/num_episodes) + "%")
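With all that in place, training the agent is just a matter of calling this function with an existing output folder for the checkpoints; for instance (the folder name below is only an illustration):

    # Train the agent, storing checkpoints in an existing folder:
    train_deep_q_network("data/dqn_gridworld")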
2019-03-10T12:09:06.201518 [DEBUG] Total steps: 498500, mean value: 22.700000, e: 0.100000
2019-03-10T12:09:08.648956 [DEBUG] Total steps: 499000, mean value: 22.600000, e: 0.100000
2019-03-10T12:09:11.015712 [DEBUG] Total steps: 499500, mean value: 23.200000, e: 0.100000
2019-03-10T12:09:13.457738 [DEBUG] Total steps: 500000, mean value: 24.300000, e: 0.100000
2019-03-10T12:09:13.722211 [DEBUG] Percent of succesful episodes: 20.3113%
2019-03-10T12:09:13.784543 [DEBUG] Training completed in 2552.745198 seconds.
Now let's go back over the key elements of this implementation, starting with the Dueling DQN part. The output of the last convolutional layer is split into two halves along the channel axis: one for the advantage stream and one for the value stream:

    self.streamAC, self.streamVC = tf.split(self.conv4, 2, 3)

Each stream is then flattened into a plain batch of vectors before being fed to its own fully connected layer:

    self.streamA = slim.flatten(self.streamAC)
    self.streamV = slim.flatten(self.streamVC)
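To make the split concrete, here is a minimal numpy sketch of those two operations, with h_size shrunk to 8 for readability (it only mimics the TF graph, it is not part of the actual implementation):

    import numpy as np

    h_size = 8
    conv4 = np.arange(2 * 1 * 1 * h_size).reshape(2, 1, 1, h_size)  # dummy batch of 2

    # tf.split(self.conv4, 2, 3): split along axis 3 into 2 equal halves.
    streamAC, streamVC = np.split(conv4, 2, axis=3)
    print(streamAC.shape)  # (2, 1, 1, 4): first h_size//2 channels
    print(streamVC.shape)  # (2, 1, 1, 4): last h_size//2 channels

    # slim.flatten: collapse everything but the batch dimension.
    streamA = streamAC.reshape(2, -1)  # (2, 4)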
The two streams are recombined into the final Q-values following the dueling formulation: the mean advantage is subtracted from each action's advantage, so the value stream alone has to account for the overall quality of the state:

    self.Qout = self.Value + tf.subtract(self.Advantage,
        tf.reduce_mean(self.Advantage, axis=1, keep_dims=True))
    self.predict = tf.argmax(self.Qout, 1)
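Here is a small numpy sketch of that aggregation on made-up values, for a batch of 2 states and 4 actions:

    import numpy as np

    V = np.array([[1.0], [2.0]])            # state values, shape (2, 1)
    A = np.array([[0.5, 1.5, -0.5, 0.5],
                  [1.0, 0.0, -1.0, 0.0]])   # advantages, shape (2, 4)

    # Q = V + (A - mean(A)), broadcast over the action dimension:
    Q = V + (A - A.mean(axis=1, keepdims=True))
    print(Q)                  # [[1. 2. 0. 1.]
                              #  [3. 2. 1. 2.]]
    print(Q.argmax(axis=1))   # greedy actions: [1 0]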
For the training step we also need the Q-value currently assigned to the action that was actually taken in each sampled experience. The actions are one-hot encoded, and a multiply/reduce_sum pair extracts the matching entry from Qout:

    self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
    self.actions_onehot = tf.one_hot(self.actions, env.actions, dtype=tf.float32)

    self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)
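Again, a tiny numpy sketch with made-up values shows what this pair of operations computes:

    import numpy as np

    Qout = np.array([[1.0, 3.0, 0.5, 2.0],
                     [0.0, 1.0, 4.0, 2.0]])   # 2 states x 4 actions
    actions = np.array([1, 2])                # actions taken in each experience

    onehot = np.eye(4)[actions]               # one-hot encoding, shape (2, 4)
    Q = (Qout * onehot).sum(axis=1)           # [3. 4.]: Q-values of the taken actions
    print(Q)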
The loss is then simply the mean squared TD error between those selected Q-values and the targets computed with the target network:

    self.td_error = tf.square(self.targetQ - self.Q)
    self.loss = tf.reduce_mean(self.td_error)
Next comes the target network update: instead of copying the primary network's weights wholesale every N steps, we continuously blend them into the target network at rate tau. Since the primary network is built first, tf.trainable_variables() lists its variables in the first half and the target network's in the second half, so each target variable at index idx+total_vars//2 is moved toward its primary counterpart at index idx:

    def updateTargetGraph(tfVars, tau):
        total_vars = len(tfVars)
        op_holder = []
        for idx, var in enumerate(tfVars[0:total_vars//2]):
            op_holder.append(tfVars[idx+total_vars//2].assign(
                (var.value()*tau) + ((1-tau)*tfVars[idx+total_vars//2].value())))
        return op_holder
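In other words, we apply the soft update rule theta_target <- tau*theta_primary + (1-tau)*theta_target. A quick numpy sketch on a made-up weight vector shows how slowly the target network tracks the primary one with tau=0.001:

    import numpy as np

    tau = 0.001
    theta_primary = np.array([1.0, 2.0, 3.0])
    theta_target = np.zeros(3)

    for _ in range(1000):
        theta_target = tau*theta_primary + (1 - tau)*theta_target

    print(theta_target)  # only ~63% of the way to theta_primary after 1000 updates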
Finally, the Double DQN update itself: the primary network selects the greedy action in each next state (Q1), while the target network provides the Q-values used to evaluate that action (Q2). Decoupling action selection from action evaluation this way reduces the Q-value overestimation that vanilla DQN suffers from:

    Q1 = sess.run(mainQN.predict, feed_dict={mainQN.scalarInput: np.vstack(trainBatch[:,3])})
    Q2 = sess.run(targetQN.Qout, feed_dict={targetQN.scalarInput: np.vstack(trainBatch[:,3])})
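And here is a numpy sketch of the full target computation on made-up values, mirroring the end_multiplier/doubleQ/targetQ lines from the training loop:

    import numpy as np

    y = 0.99
    r = np.array([0.0, 1.0])       # rewards of 2 sampled experiences
    d = np.array([0.0, 1.0])       # done flags (1.0 = episode ended)

    Q1 = np.array([1, 0])          # greedy next actions from the primary network
    Q2 = np.array([[0.2, 0.8],     # next-state Q-values from the target network
                   [0.5, 0.1]])

    end_multiplier = -(d - 1)               # 1 if non-terminal, 0 if terminal
    doubleQ = Q2[range(2), Q1]              # target net's value of the primary net's action
    targetQ = r + y*doubleQ*end_multiplier  # [0.792, 1.0]
    print(targetQ)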