Model-based policy networks

Continuing on our reinforcement learning path, we will consider here how to build a “model for our environment” and then use this model to train our policy network, instead of the “actual environment”.

References

Initial implementation

  • So our reference implementation is here:
    import numpy as np
    import random
    import gym
    
    import tensorflow as tf
    import tensorflow.contrib.slim as slim
    import numpy as np
    
    from nv.core.utils import *
    
    # cf. https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-3-model-based-rl-9a6fe0cce99
    
    def train_model_policy_network():
        """Train a CartPole policy network with the help of a learned environment model.

        Two networks are built in the same TensorFlow graph:

        * a policy network (4 inputs -> H ReLU units -> 1 sigmoid output) giving
          the probability of choosing action 1;
        * a model network (observation + action -> 2 ReLU layers of mH units)
          with three heads predicting the next observation, the reward and the
          "done" probability.

        Until more than 100 episodes have been played, only the model is trained,
        from real 'CartPole-v0' episodes.  After that, at every batch boundary the
        loop toggles between (a) training the model on real episodes and
        (b) training the policy on episodes simulated by the model.

        The loop stops after 5000 episodes, when the mean real-environment reward
        over a batch reaches 200, or when the policy gradient for W1 is all NaN.

        Returns nothing; progress is reported through logDEBUG.
        """
        logDEBUG("Building environment...")
        env = gym.make('CartPole-v0')

        # hyperparameters
        H = 8 # number of hidden layer neurons
        learning_rate = 1e-2
        gamma = 0.99 # discount factor for reward

        model_bs = 3 # Batch size when learning from model
        real_bs = 3 # Batch size when learning from real environment

        # Policy network:
        tf.reset_default_graph()
        observations = tf.placeholder(tf.float32, [None,4] , name="input_x")
        W1 = tf.get_variable("W1", shape=[4, H],
                initializer=tf.contrib.layers.xavier_initializer())
        layer1 = tf.nn.relu(tf.matmul(observations,W1))
        W2 = tf.get_variable("W2", shape=[H, 1],
                initializer=tf.contrib.layers.xavier_initializer())
        score = tf.matmul(layer1,W2)
        probability = tf.nn.sigmoid(score)

        # IMPORTANT: capture the trainable variables *before* the model network
        # is created, so that tvars only holds the policy weights (W1, W2) that
        # are paired with the two gradient placeholders below.
        tvars = tf.trainable_variables()
        input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
        advantages = tf.placeholder(tf.float32,name="reward_signal")

        adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
        W1Grad = tf.placeholder(tf.float32,name="batch_grad1")
        W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
        batchGrad = [W1Grad,W2Grad]
        # input_y is 1 when action 0 was taken (see the main loop), so this is
        # log(1 - probability) for action 0 and log(probability) for action 1,
        # i.e. the log-likelihood of the action that was actually chosen.
        loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
        loss = -tf.reduce_mean(loglik * advantages)
        # Gradients are computed per episode, accumulated in Python, and only
        # applied once per batch through the placeholders.
        newGrads = tf.gradients(loss,tvars)
        updateGrads = adam.apply_gradients(zip(batchGrad,tvars))


        # Model network:
        mH = 256 # model layer size

        # Input is the 4 observation values followed by the action.
        previous_state = tf.placeholder(tf.float32, [None,5] , name="previous_state")
        W1M = tf.get_variable("W1M", shape=[5, mH],
                initializer=tf.contrib.layers.xavier_initializer())
        B1M = tf.Variable(tf.zeros([mH]),name="B1M")
        layer1M = tf.nn.relu(tf.matmul(previous_state,W1M) + B1M)
        W2M = tf.get_variable("W2M", shape=[mH, mH],
                initializer=tf.contrib.layers.xavier_initializer())
        B2M = tf.Variable(tf.zeros([mH]),name="B2M")
        layer2M = tf.nn.relu(tf.matmul(layer1M,W2M) + B2M)
        # Three output heads sharing the same hidden layers.
        wO = tf.get_variable("wO", shape=[mH, 4],
                initializer=tf.contrib.layers.xavier_initializer())
        wR = tf.get_variable("wR", shape=[mH, 1],
                initializer=tf.contrib.layers.xavier_initializer())
        wD = tf.get_variable("wD", shape=[mH, 1],
                initializer=tf.contrib.layers.xavier_initializer())

        bO = tf.Variable(tf.zeros([4]),name="bO")
        bR = tf.Variable(tf.zeros([1]),name="bR")
        bD = tf.Variable(tf.ones([1]),name="bD")


        predicted_observation = tf.matmul(layer2M,wO,name="predicted_observation") + bO
        predicted_reward = tf.matmul(layer2M,wR,name="predicted_reward") + bR
        predicted_done = tf.sigmoid(tf.matmul(layer2M,wD,name="predicted_done") + bD)

        true_observation = tf.placeholder(tf.float32,[None,4],name="true_observation")
        true_reward = tf.placeholder(tf.float32,[None,1],name="true_reward")
        true_done = tf.placeholder(tf.float32,[None,1],name="true_done")


        # Columns 0-3: observation, column 4: reward, column 5: done probability.
        predicted_state = tf.concat([predicted_observation,predicted_reward,predicted_done],1)

        observation_loss = tf.square(true_observation - predicted_observation)

        reward_loss = tf.square(true_reward - predicted_reward)

        # Negative log of the probability assigned to the true done flag.
        done_loss = tf.multiply(predicted_done, true_done) + tf.multiply(1-predicted_done, 1-true_done)
        done_loss = -tf.log(done_loss)

        # The three per-head losses are summed (with broadcasting) and averaged.
        model_loss = tf.reduce_mean(observation_loss + done_loss + reward_loss)

        modelAdam = tf.train.AdamOptimizer(learning_rate=learning_rate)
        updateModel = modelAdam.minimize(model_loss)

        # helper functions:
        def resetGradBuffer(gradBuffer):
            """Zero every array of gradBuffer in place and return the list."""
            for ix,grad in enumerate(gradBuffer):
                gradBuffer[ix] = grad * 0
            return gradBuffer

        def discount_rewards(r):
            """ take 1D float array of rewards and compute discounted reward """
            discounted_r = np.zeros_like(r)
            running_add = 0
            for t in reversed(range(0, r.size)):
                running_add = running_add * gamma + r[t]
                discounted_r[t] = running_add
            return discounted_r


        # This function uses our model to produce a new state when given a previous state and action
        def stepModel(sess, xs, action):
            """Simulate one environment step with the model network.

            xs is the list of observations recorded so far in the episode (the
            last one is the current state) and action is the chosen action.
            Returns (observation, reward, done) like env.step() minus the info.
            """
            toFeed = np.reshape(np.hstack([xs[-1][0],np.array(action)]),[1,5])
            myPredict = sess.run([predicted_state],feed_dict={previous_state: toFeed})
            # We should clip the model reward here: the reward head is an
            # unbounded linear output and sometimes predicts huge values.
            reward = myPredict[0][:,4]
            if abs(reward[0])>1000.0:
                logDEBUG("Clipping model reward value %f" % reward[0])
                reward = np.clip(reward, -1000.0,1000.0)

            # Keep the simulated cart position (column 0) and pole angle
            # (column 2) inside fixed bounds.
            observation = myPredict[0][:,0:4]
            observation[:,0] = np.clip(observation[:,0],-2.4,2.4)
            observation[:,2] = np.clip(observation[:,2],-0.4,0.4)
            doneP = np.clip(myPredict[0][:,5],0,1)
            # End the simulated episode on a modest done probability, or after
            # 300 steps so that a simulated episode can never run forever.
            done = bool(doneP[0] > 0.1) or len(xs) >= 300
            return observation, reward, done

        # Training the model and policy together:
        xs,drs,ys,ds = [],[],[],[]
        running_reward = None
        reward_sum = 0
        episode_number = 1
        real_episodes = 1
        init = tf.global_variables_initializer()
        batch_size = real_bs

        drawFromModel = False # When set to True, will use model for observations
        trainTheModel = True # Whether to train the model
        trainThePolicy = False # Whether to train the policy
        switch_point = 1

        # Launch the graph
        with tf.Session() as sess:
            sess.run(init)
            observation = env.reset()
            # Fetch the policy weights once only to get correctly shaped
            # arrays, then zero them to obtain the gradient accumulator.
            gradBuffer = sess.run(tvars)
            gradBuffer = resetGradBuffer(gradBuffer)

            # NOTE: rendering of the environment is not supported yet.
            while episode_number <= 5000:
                x = np.reshape(observation,[1,4])

                tfprob = sess.run(probability,feed_dict={observations: x})
                action = 1 if np.random.uniform() < tfprob else 0

                # record various intermediates (needed later for backprop)
                xs.append(x)
                y = 1 if action == 0 else 0
                ys.append(y)

                # step the  model or real environment and get new measurements
                if not drawFromModel:
                    observation, reward, done, info = env.step(action)
                    CHECK(reward <= 1000.0, "Invalid reward value from env: %f"%reward)
                else:
                    observation, reward, done = stepModel(sess,xs,action)
                    CHECK(reward <= 1000.0, "Invalid reward value from model: %f"%reward)

                reward_sum += reward

                ds.append(done*1)
                drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

                if done:

                    if not drawFromModel:
                        real_episodes += 1
                    episode_number += 1

                    # stack together all inputs, hidden states, action gradients, and rewards for this episode
                    epx = np.vstack(xs)
                    epy = np.vstack(ys)
                    epr = np.vstack(drs)
                    epd = np.vstack(ds)
                    xs,drs,ys,ds = [],[],[],[] # reset array memory

                    # A one-step episode yields no (previous, next) pair to
                    # learn from, so it is skipped for model training.
                    if trainTheModel and len(epx) > 1:
                        # ys stores 1 for action 0, so |y - 1| recovers the action.
                        actions = np.abs(epy[:-1] - 1)
                        state_prevs = np.hstack([epx[:-1,:],actions])
                        state_nexts = epx[1:,:]
                        # Rewards and done flags are taken with the same
                        # one-step shift as the next observations.
                        rewards = np.array(epr[1:,:])
                        dones = np.array(epd[1:,:])

                        feed_dict={previous_state: state_prevs, true_observation: state_nexts,true_done:dones,true_reward:rewards}
                        sess.run(updateModel,feed_dict)
                    if trainThePolicy:
                        # Normalise the discounted returns to zero mean and
                        # (when possible) unit standard deviation.
                        discounted_epr = discount_rewards(epr).astype('float32')
                        discounted_epr -= np.mean(discounted_epr)
                        dev = np.std(discounted_epr)
                        if dev > 0.0:
                            discounted_epr /= dev

                        tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})

                        # If the W1 gradient has degenerated to NaN everywhere, end the training process
                        if np.isnan(tGrad[0]).all():
                            break
                        for ix,grad in enumerate(tGrad):
                            gradBuffer[ix] += grad

                    # A full batch of episodes has been played since the last switch point.
                    if switch_point + batch_size == episode_number:
                        switch_point = episode_number
                        if trainThePolicy:
                            sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
                            gradBuffer = resetGradBuffer(gradBuffer)

                        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                        if not drawFromModel:
                            logDEBUG('World Perf: Episode %d, Reward %f, action: %d, mean reward %f.' % (real_episodes,reward_sum/real_bs,action, running_reward/real_bs))
                            if reward_sum/batch_size >= 200:
                                break
                        reward_sum = 0

                        # Once the model has been trained on 100 episodes, we start alternating between training the policy
                        # from the model and training the model from the real environment.
                        if episode_number > 100:
                            drawFromModel = not drawFromModel
                            trainTheModel = not trainTheModel
                            trainThePolicy = not trainThePolicy

                    if drawFromModel:
                        observation = np.random.uniform(-0.1,0.1,[4]) # Generate reasonable starting point
                        batch_size = model_bs
                    else:
                        observation = env.reset()
                        batch_size = real_bs

        logDEBUG("Done with %d real episodes."% real_episodes)
    
  • This implementation works, except that I had to clamp the reward values predicted by the model network to some “reasonable value”:
        def stepModel(sess, xs, action):
            # Model input: the latest recorded observation (xs[-1][0]) followed by the chosen action.
            toFeed = np.reshape(np.hstack([xs[-1][0],np.array(action)]),[1,5])
            myPredict = sess.run([predicted_state],feed_dict={previous_state: toFeed})
            # We should clip the model reward here (column 4 of the predicted state):
            reward = myPredict[0][:,4]
            if abs(reward)>1000.0:
                logDEBUG("Clipping model reward value %f" % reward)
                reward = np.clip(reward, -1000.0,1000.0)
    
  • This is because the model currently sometimes produces very large reward predictions, such as:
    2019-03-09T21:46:49.130780 [DEBUG] Clipping model reward value 27099967488.000000
    2019-03-09T21:46:49.132562 [DEBUG] Clipping model reward value 29887655936.000000
    2019-03-09T21:46:49.186213 [DEBUG] World Perf: Episode 187, Reward 34.666667, action: 0, mean reward 1337.982666.
    2019-03-09T21:46:49.268631 [DEBUG] World Perf: Episode 190, Reward 34.333333, action: 1, mean reward 1311.774658.
  • Thus, without this clamping, the mean reward value gets completely “out of control”.

Analysis

  • Nothing new to say on the policy network: it's exactly the same as the one we used in the post on Full policy Gradient agent
  • Let's rather focus on the model network.
  • Our input for this network has a dimension D=5
  • Actually, the following part of the code is useless (so I will remove it):
        # Dead code: neither this placeholder nor the two 'rnnlm' variables are
        # referenced by any op of the policy or model networks.
        input_data = tf.placeholder(tf.float32, [None, 5])
        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w", [mH, 50])
            softmax_b = tf.get_variable("softmax_b", [50])
    
  • For the model, we use 2 fully connected ReLU layers with mH=256 units each
  • Then we get 3 different outputs from those layers: the predicted next observation, next reward and next done state.
  • Then we compute the loss for each of those outputs, and we sum them before we can do the optimization step.
  • ⇒ The clever thing here is really that from a single network, we can consider that we “split the outputs” into conceptually separated components, which could be a very useful trick in general.
  • Then, for the actual training, we train the model and the policy network separately: the model is trained on inputs from the real environment, whereas the policy is trained on inputs from our model network. Nothing too fancy here.