Continuing on our reinforcement learning path, we will now consider how to build a model of our environment, and then use this learned model, instead of the actual environment, to train our policy network.
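At a high level, the idea is to alternate between two training modes: we improve the environment model on episodes collected from the real environment, and we improve the policy on episodes generated by the model itself. Here is a minimal sketch of that control flow (all the function names below are hypothetical placeholders, not part of the actual implementation):

# Hypothetical sketch of the model-based RL training scheme used below.

def run_episode(source):
    # placeholder: collect one episode, either from the real environment
    # or from the learned environment model.
    return []

def train_model_on(episode):
    # placeholder: one gradient step on the environment model.
    pass

def train_policy_on(episode):
    # placeholder: one policy gradient step.
    pass

draw_from_model = False  # which "environment" produces the episodes
train_the_model = True   # start by only training the model...
train_the_policy = False # ...on real data, with a frozen policy.

for episode_number in range(1, 5001):
    episode = run_episode("model" if draw_from_model else "real")
    if train_the_model:
        train_model_on(episode)
    if train_the_policy:
        train_policy_on(episode)

    # After a warm-up phase on the real environment, flip all the switches
    # at each batch boundary (the real code below tracks a 'switch_point'
    # for this; here we simply use a fixed batch size of 3):
    if episode_number > 100 and episode_number % 3 == 0:
        draw_from_model = not draw_from_model
        train_the_model = not train_the_model
        train_the_policy = not train_the_policy

The actual implementation is as follows: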
import numpy as np
import random
import gym
import tensorflow as tf
import tensorflow.contrib.slim as slim

from nv.core.utils import *

# cf. https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-3-model-based-rl-9a6fe0cce99

def train_model_policy_network():
    logDEBUG("Building environment...")
    env = gym.make('CartPole-v0')

    # hyperparameters
    H = 8              # number of hidden layer neurons
    learning_rate = 1e-2
    gamma = 0.99       # discount factor for reward
    decay_rate = 0.99  # decay factor for RMSProp leaky sum of grad^2
    resume = False     # resume from previous checkpoint?

    model_bs = 3       # Batch size when learning from model
    real_bs = 3        # Batch size when learning from real environment

    # model initialization
    D = 4  # input dimensionality

    # Policy network:
    tf.reset_default_graph()
    observations = tf.placeholder(tf.float32, [None, 4], name="input_x")
    W1 = tf.get_variable("W1", shape=[4, H],
                         initializer=tf.contrib.layers.xavier_initializer())
    layer1 = tf.nn.relu(tf.matmul(observations, W1))
    W2 = tf.get_variable("W2", shape=[H, 1],
                         initializer=tf.contrib.layers.xavier_initializer())
    score = tf.matmul(layer1, W2)
    probability = tf.nn.sigmoid(score)

    # Note: tvars is captured here on purpose, so that it only contains the
    # policy weights W1 and W2 (the model weights are created further below):
    tvars = tf.trainable_variables()
    input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
    advantages = tf.placeholder(tf.float32, name="reward_signal")
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    W1Grad = tf.placeholder(tf.float32, name="batch_grad1")
    W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
    batchGrad = [W1Grad, W2Grad]
    loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
    loss = -tf.reduce_mean(loglik * advantages)
    newGrads = tf.gradients(loss, tvars)
    updateGrads = adam.apply_gradients(zip(batchGrad, tvars))

    # Model network:
    mH = 256  # model layer size

    input_data = tf.placeholder(tf.float32, [None, 5])
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [mH, 50])
        softmax_b = tf.get_variable("softmax_b", [50])

    previous_state = tf.placeholder(tf.float32, [None, 5], name="previous_state")
    W1M = tf.get_variable("W1M", shape=[5, mH],
                          initializer=tf.contrib.layers.xavier_initializer())
    B1M = tf.Variable(tf.zeros([mH]), name="B1M")
    layer1M = tf.nn.relu(tf.matmul(previous_state, W1M) + B1M)
    W2M = tf.get_variable("W2M", shape=[mH, mH],
                          initializer=tf.contrib.layers.xavier_initializer())
    B2M = tf.Variable(tf.zeros([mH]), name="B2M")
    layer2M = tf.nn.relu(tf.matmul(layer1M, W2M) + B2M)
    wO = tf.get_variable("wO", shape=[mH, 4],
                         initializer=tf.contrib.layers.xavier_initializer())
    wR = tf.get_variable("wR", shape=[mH, 1],
                         initializer=tf.contrib.layers.xavier_initializer())
    wD = tf.get_variable("wD", shape=[mH, 1],
                         initializer=tf.contrib.layers.xavier_initializer())

    bO = tf.Variable(tf.zeros([4]), name="bO")
    bR = tf.Variable(tf.zeros([1]), name="bR")
    bD = tf.Variable(tf.ones([1]), name="bD")

    predicted_observation = tf.matmul(layer2M, wO, name="predicted_observation") + bO
    predicted_reward = tf.matmul(layer2M, wR, name="predicted_reward") + bR
    predicted_done = tf.sigmoid(tf.matmul(layer2M, wD, name="predicted_done") + bD)

    true_observation = tf.placeholder(tf.float32, [None, 4], name="true_observation")
    true_reward = tf.placeholder(tf.float32, [None, 1], name="true_reward")
    true_done = tf.placeholder(tf.float32, [None, 1], name="true_done")

    predicted_state = tf.concat([predicted_observation, predicted_reward, predicted_done], 1)
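    # Note on the data layout used by this model network (this simply restates
    # what the placeholders and slicing operations above and below assume):
    #
    #   input:  [x, x_dot, theta, theta_dot, action]            -> shape [None, 5]
    #   output: [x', x_dot', theta', theta_dot', reward, done]  -> shape [None, 6]
    #
    # ie. the model predicts the next observation, the reward and the
    # termination probability as a single 6-dim vector; this is the layout
    # assumed by the column slicing done in stepModel() further below.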
    observation_loss = tf.square(true_observation - predicted_observation)
    reward_loss = tf.square(true_reward - predicted_reward)
    done_loss = tf.multiply(predicted_done, true_done) + tf.multiply(1 - predicted_done, 1 - true_done)
    done_loss = -tf.log(done_loss)

    model_loss = tf.reduce_mean(observation_loss + done_loss + reward_loss)
    modelAdam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    updateModel = modelAdam.minimize(model_loss)

    # helper functions:
    def resetGradBuffer(gradBuffer):
        for ix, grad in enumerate(gradBuffer):
            gradBuffer[ix] = grad * 0
        return gradBuffer

    def discount_rewards(r):
        """ take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, r.size)):
            running_add = running_add * gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r
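    # Quick sanity check on discount_rewards() (values computed by hand, just
    # as an illustration): with gamma = 0.99 and r = [1.0, 1.0, 1.0] we get
    #   discounted_r = [0.99*0.99 + 0.99 + 1.0, 0.99 + 1.0, 1.0]
    #                = [2.9701, 1.99, 1.0]
    # ie. earlier actions get credit for all the rewards that follow them.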
    # This function uses our model to produce a new state when given a previous state and action
    def stepModel(sess, xs, action):
        toFeed = np.reshape(np.hstack([xs[-1][0], np.array(action)]), [1, 5])
        myPredict = sess.run([predicted_state], feed_dict={previous_state: toFeed})

        # We should clip the model reward here:
        reward = myPredict[0][:, 4]
        if abs(reward) > 1000.0:
            logDEBUG("Clipping model reward value %f" % reward)
            reward = np.clip(reward, -1000.0, 1000.0)

        observation = myPredict[0][:, 0:4]
        observation[:, 0] = np.clip(observation[:, 0], -2.4, 2.4)
        observation[:, 2] = np.clip(observation[:, 2], -0.4, 0.4)
        doneP = np.clip(myPredict[0][:, 5], 0, 1)
        if doneP > 0.1 or len(xs) >= 300:
            done = True
        else:
            done = False
        return observation, reward, done

    # Training the model and policy together:
    xs, drs, ys, ds = [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 1
    real_episodes = 1
    init = tf.global_variables_initializer()
    batch_size = real_bs

    drawFromModel = False   # When set to True, will use model for observations
    trainTheModel = True    # Whether to train the model
    trainThePolicy = False  # Whether to train the policy
    switch_point = 1

    # Launch the graph
    with tf.Session() as sess:
        rendering = False
        sess.run(init)
        observation = env.reset()
        x = observation
        gradBuffer = sess.run(tvars)
        gradBuffer = resetGradBuffer(gradBuffer)

        while episode_number <= 5000:
            # Start displaying the environment once performance is acceptably high.
            # No rendering support yet:
            # if (reward_sum/batch_size > 150 and drawFromModel == False) or rendering == True:
            #     env.render()
            #     rendering = True

            x = np.reshape(observation, [1, 4])

            tfprob = sess.run(probability, feed_dict={observations: x})
            action = 1 if np.random.uniform() < tfprob else 0

            # record various intermediates (needed later for backprop)
            xs.append(x)
            y = 1 if action == 0 else 0
            ys.append(y)

            # step the model or real environment and get new measurements
            if drawFromModel == False:
                observation, reward, done, info = env.step(action)
                CHECK(reward <= 1000.0, "Invalid reward value from env: %f" % reward)
            else:
                observation, reward, done = stepModel(sess, xs, action)
                CHECK(reward <= 1000.0, "Invalid reward value from model: %f" % reward)

            reward_sum += reward

            ds.append(done*1)
            drs.append(reward)  # record reward (has to be done after we call step() to get reward for previous action)

            if done:
                if drawFromModel == False:
                    real_episodes += 1
                episode_number += 1

                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                epx = np.vstack(xs)
                epy = np.vstack(ys)
                epr = np.vstack(drs)
                epd = np.vstack(ds)
                xs, drs, ys, ds = [], [], [], []  # reset array memory

                if trainTheModel == True:
                    actions = np.array([np.abs(y - 1) for y in epy][:-1])
                    state_prevs = epx[:-1, :]
                    state_prevs = np.hstack([state_prevs, actions])
                    state_nexts = epx[1:, :]
                    rewards = np.array(epr[1:, :])
                    dones = np.array(epd[1:, :])
                    state_nextsAll = np.hstack([state_nexts, rewards, dones])

                    feed_dict = {previous_state: state_prevs, true_observation: state_nexts,
                                 true_done: dones, true_reward: rewards}
                    loss, pState, _ = sess.run([model_loss, predicted_state, updateModel], feed_dict)

                if trainThePolicy == True:
                    discounted_epr = discount_rewards(epr).astype('float32')
                    discounted_epr -= np.mean(discounted_epr)
                    dev = np.std(discounted_epr)
                    if dev > 0.0:
                        discounted_epr /= dev
                    tGrad = sess.run(newGrads, feed_dict={observations: epx,
                                                          input_y: epy,
                                                          advantages: discounted_epr})

                    # If the gradients become too large (ie. contain NaNs), end the training process:
                    if np.sum(tGrad[0] == tGrad[0]) == 0:
                        break
                    for ix, grad in enumerate(tGrad):
                        gradBuffer[ix] += grad

                if switch_point + batch_size == episode_number:
                    switch_point = episode_number
                    if trainThePolicy == True:
                        sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                        gradBuffer = resetGradBuffer(gradBuffer)

                    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                    if drawFromModel == False:
                        logDEBUG('World Perf: Episode %d, Reward %f, action: %d, mean reward %f.' %
                                 (real_episodes, reward_sum/real_bs, action, running_reward/real_bs))
                        if reward_sum/batch_size >= 200:
                            break
                    reward_sum = 0

                    # Once the model has been trained on 100 episodes, we start alternating between training the policy
                    # from the model and training the model from the real environment.
                    if episode_number > 100:
                        drawFromModel = not drawFromModel
                        trainTheModel = not trainTheModel
                        trainThePolicy = not trainThePolicy

                if drawFromModel == True:
                    observation = np.random.uniform(-0.1, 0.1, [4])  # Generate a reasonable starting point
                    batch_size = model_bs
                else:
                    observation = env.reset()
                    batch_size = real_bs

    logDEBUG("Done with %d real episodes." % real_episodes)
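Note that the logDEBUG and CHECK helpers used above come from our nv.core.utils module; if you want to try this function outside of that framework, minimal stand-ins could look like this (hypothetical replacements, just enough to satisfy the calls above):

import datetime

def logDEBUG(msg):
    # Minimal stand-in for nv.core.utils.logDEBUG: timestamped debug output.
    print("%s [DEBUG] %s" % (datetime.datetime.now().isoformat(), msg))

def CHECK(cond, msg):
    # Minimal stand-in for nv.core.utils.CHECK: fail loudly on invalid values.
    if not cond:
        raise AssertionError(msg)

With those in place, the whole experiment reduces to a single call:

train_model_policy_network()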
One detail worth highlighting in the stepModel() function above: we clip the reward predicted by the model, since it can take absurdly large values as long as the model is not properly trained yet:

def stepModel(sess, xs, action):
    toFeed = np.reshape(np.hstack([xs[-1][0], np.array(action)]), [1, 5])
    myPredict = sess.run([predicted_state], feed_dict={previous_state: toFeed})

    # We should clip the model reward here:
    reward = myPredict[0][:, 4]
    if abs(reward) > 1000.0:
        logDEBUG("Clipping model reward value %f" % reward)
        reward = np.clip(reward, -1000.0, 1000.0)

And we can indeed see this clipping being triggered during training:
2019-03-09T21:46:49.130780 [DEBUG] Clipping model reward value 27099967488.000000
2019-03-09T21:46:49.132562 [DEBUG] Clipping model reward value 29887655936.000000
2019-03-09T21:46:49.186213 [DEBUG] World Perf: Episode 187, Reward 34.666667, action: 0, mean reward 1337.982666.
2019-03-09T21:46:49.268631 [DEBUG] World Perf: Episode 190, Reward 34.333333, action: 1, mean reward 1311.774658.
As a final note, the reference implementation also builds the following placeholders and variables, which are never used anywhere else in the graph; this looks like a leftover (possibly from an RNN language model example, given the 'rnnlm' scope name) and could probably be removed:

input_data = tf.placeholder(tf.float32, [None, 5])
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [mH, 50])
    softmax_b = tf.get_variable("softmax_b", [50])