====== Model-based policy networks ======

{{tag>deep_learning}}

Continuing on our reinforcement learning path, we will consider here how to build a "model of our environment", and then use this model, instead of the actual environment, to train our policy network.

===== References =====

  * [[https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-3-model-based-rl-9a6fe0cce99|Simple Reinforcement Learning with Tensorflow: Part 3 - Model-Based RL]]
  * [[https://github.com/awjuliani/DeepRL-Agents/blob/master/Model-Network.ipynb|Model-Network.ipynb from the DeepRL-Agents repository]]

===== Initial implementation =====

  * So our reference implementation is the following:

<code python>
import numpy as np
import random
import gym
import tensorflow as tf
import tensorflow.contrib.slim as slim

from nv.core.utils import *

# cf. https://medium.com/@awjuliani/simple-reinforcement-learning-with-tensorflow-part-3-model-based-rl-9a6fe0cce99
def train_model_policy_network():
    logDEBUG("Building environment...")
    env = gym.make('CartPole-v0')

    # hyperparameters
    H = 8 # number of hidden layer neurons
    learning_rate = 1e-2
    gamma = 0.99 # discount factor for reward
    decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
    resume = False # resume from previous checkpoint?

    model_bs = 3 # Batch size when learning from model
    real_bs = 3 # Batch size when learning from real environment

    # model initialization
    D = 4 # input dimensionality

    # Policy network:
    tf.reset_default_graph()
    observations = tf.placeholder(tf.float32, [None,4], name="input_x")
    W1 = tf.get_variable("W1", shape=[4, H], initializer=tf.contrib.layers.xavier_initializer())
    layer1 = tf.nn.relu(tf.matmul(observations,W1))
    W2 = tf.get_variable("W2", shape=[H, 1], initializer=tf.contrib.layers.xavier_initializer())
    score = tf.matmul(layer1,W2)
    probability = tf.nn.sigmoid(score)

    tvars = tf.trainable_variables()
    input_y = tf.placeholder(tf.float32,[None,1], name="input_y")
    advantages = tf.placeholder(tf.float32,name="reward_signal")
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    W1Grad = tf.placeholder(tf.float32,name="batch_grad1")
    W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
    batchGrad = [W1Grad,W2Grad]
    loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
    loss = -tf.reduce_mean(loglik * advantages)
    newGrads = tf.gradients(loss,tvars)
    updateGrads = adam.apply_gradients(zip(batchGrad,tvars))

    # Model network:
    mH = 256 # model layer size

    input_data = tf.placeholder(tf.float32, [None, 5])
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [mH, 50])
        softmax_b = tf.get_variable("softmax_b", [50])

    previous_state = tf.placeholder(tf.float32, [None,5], name="previous_state")
    W1M = tf.get_variable("W1M", shape=[5, mH], initializer=tf.contrib.layers.xavier_initializer())
    B1M = tf.Variable(tf.zeros([mH]),name="B1M")
    layer1M = tf.nn.relu(tf.matmul(previous_state,W1M) + B1M)
    W2M = tf.get_variable("W2M", shape=[mH, mH], initializer=tf.contrib.layers.xavier_initializer())
    B2M = tf.Variable(tf.zeros([mH]),name="B2M")
    layer2M = tf.nn.relu(tf.matmul(layer1M,W2M) + B2M)
    wO = tf.get_variable("wO", shape=[mH, 4], initializer=tf.contrib.layers.xavier_initializer())
    wR = tf.get_variable("wR", shape=[mH, 1], initializer=tf.contrib.layers.xavier_initializer())
    wD = tf.get_variable("wD", shape=[mH, 1], initializer=tf.contrib.layers.xavier_initializer())

    bO = tf.Variable(tf.zeros([4]),name="bO")
    bR = tf.Variable(tf.zeros([1]),name="bR")
    bD = tf.Variable(tf.ones([1]),name="bD")
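    # Three output heads share the hidden representation layer2M: one predicts
    # the next observation (4 values), one the next reward, and one the
    # probability that the episode is done.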
    predicted_observation = tf.matmul(layer2M,wO,name="predicted_observation") + bO
    predicted_reward = tf.matmul(layer2M,wR,name="predicted_reward") + bR
    predicted_done = tf.sigmoid(tf.matmul(layer2M,wD,name="predicted_done") + bD)

    true_observation = tf.placeholder(tf.float32,[None,4],name="true_observation")
    true_reward = tf.placeholder(tf.float32,[None,1],name="true_reward")
    true_done = tf.placeholder(tf.float32,[None,1],name="true_done")

    predicted_state = tf.concat([predicted_observation,predicted_reward,predicted_done],1)

    observation_loss = tf.square(true_observation - predicted_observation)
    reward_loss = tf.square(true_reward - predicted_reward)

    done_loss = tf.multiply(predicted_done, true_done) + tf.multiply(1-predicted_done, 1-true_done)
    done_loss = -tf.log(done_loss)

    model_loss = tf.reduce_mean(observation_loss + done_loss + reward_loss)

    modelAdam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    updateModel = modelAdam.minimize(model_loss)

    # helper functions:
    def resetGradBuffer(gradBuffer):
        for ix,grad in enumerate(gradBuffer):
            gradBuffer[ix] = grad * 0
        return gradBuffer

    def discount_rewards(r):
        """ take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, r.size)):
            running_add = running_add * gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

    # This function uses our model to produce a new state when given a previous state and action
    def stepModel(sess, xs, action):
        toFeed = np.reshape(np.hstack([xs[-1][0],np.array(action)]),[1,5])
        myPredict = sess.run([predicted_state],feed_dict={previous_state: toFeed})

        # We should clip the model reward here:
        reward = myPredict[0][:,4]
        if abs(reward) > 1000.0:
            logDEBUG("Clipping model reward value %f" % reward)
            reward = np.clip(reward, -1000.0, 1000.0)

        observation = myPredict[0][:,0:4]
        observation[:,0] = np.clip(observation[:,0],-2.4,2.4)
        observation[:,2] = np.clip(observation[:,2],-0.4,0.4)
        doneP = np.clip(myPredict[0][:,5],0,1)
        if doneP > 0.1 or len(xs) >= 300:
            done = True
        else:
            done = False
        return observation, reward, done

    # Training the model and policy together:
    xs,drs,ys,ds = [],[],[],[]
    running_reward = None
    reward_sum = 0
    episode_number = 1
    real_episodes = 1
    init = tf.global_variables_initializer()
    batch_size = real_bs

    drawFromModel = False # When set to True, will use model for observations
    trainTheModel = True # Whether to train the model
    trainThePolicy = False # Whether to train the policy
    switch_point = 1

    # Launch the graph
    with tf.Session() as sess:
        rendering = False
        sess.run(init)
        observation = env.reset()
        x = observation
        gradBuffer = sess.run(tvars)
        gradBuffer = resetGradBuffer(gradBuffer)
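        # Main training loop: each iteration performs one step in the real
        # environment or in the learned model (depending on drawFromModel);
        # when an episode ends, the model and/or the policy may be updated.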
        while episode_number <= 5000:
            # Start displaying environment once performance is acceptably high.
            # No rendering support yet.
            # if (reward_sum/batch_size > 150 and drawFromModel == False) or rendering == True :
            #     env.render()
            #     rendering = True

            x = np.reshape(observation,[1,4])

            tfprob = sess.run(probability,feed_dict={observations: x})
            action = 1 if np.random.uniform() < tfprob else 0

            # record various intermediates (needed later for backprop)
            xs.append(x)
            y = 1 if action == 0 else 0
            ys.append(y)

            # step the model or real environment and get new measurements
            if drawFromModel == False:
                observation, reward, done, info = env.step(action)
                CHECK(reward <= 1000.0, "Invalid reward value from env: %f" % reward)
            else:
                observation, reward, done = stepModel(sess,xs,action)
                CHECK(reward <= 1000.0, "Invalid reward value from model: %f" % reward)

            reward_sum += reward

            ds.append(done*1)
            drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

            if done:
                if drawFromModel == False:
                    real_episodes += 1
                episode_number += 1

                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                epx = np.vstack(xs)
                epy = np.vstack(ys)
                epr = np.vstack(drs)
                epd = np.vstack(ds)
                xs,drs,ys,ds = [],[],[],[] # reset array memory

                if trainTheModel == True:
                    actions = np.array([np.abs(y-1) for y in epy][:-1])
                    state_prevs = epx[:-1,:]
                    state_prevs = np.hstack([state_prevs,actions])
                    state_nexts = epx[1:,:]
                    rewards = np.array(epr[1:,:])
                    dones = np.array(epd[1:,:])
                    state_nextsAll = np.hstack([state_nexts,rewards,dones])

                    feed_dict = {previous_state: state_prevs, true_observation: state_nexts, true_done: dones, true_reward: rewards}
                    loss,pState,_ = sess.run([model_loss,predicted_state,updateModel],feed_dict)

                if trainThePolicy == True:
                    discounted_epr = discount_rewards(epr).astype('float32')
                    discounted_epr -= np.mean(discounted_epr)
                    dev = np.std(discounted_epr)
                    if dev > 0.0:
                        discounted_epr /= dev
                    tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})

                    # If gradients become too large (i.e. turn into NaN), end the training process
                    if np.sum(tGrad[0] == tGrad[0]) == 0:
                        break
                    for ix,grad in enumerate(tGrad):
                        gradBuffer[ix] += grad

                if switch_point + batch_size == episode_number:
                    switch_point = episode_number
                    if trainThePolicy == True:
                        sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                        gradBuffer = resetGradBuffer(gradBuffer)

                    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                    if drawFromModel == False:
                        logDEBUG('World Perf: Episode %d, Reward %f, action: %d, mean reward %f.' % (real_episodes, reward_sum/real_bs, action, running_reward/real_bs))
                        if reward_sum/batch_size >= 200:
                            break
                    reward_sum = 0

                    # Once the model has been trained on 100 episodes, we start alternating between training the policy
                    # from the model and training the model from the real environment.
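                    # Flipping these flags every batch alternates between
                    # collecting real episodes to train the model, and
                    # generating model episodes to train the policy.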
                    if episode_number > 100:
                        drawFromModel = not drawFromModel
                        trainTheModel = not trainTheModel
                        trainThePolicy = not trainThePolicy

                if drawFromModel == True:
                    observation = np.random.uniform(-0.1,0.1,[4]) # Generate reasonable starting point
                    batch_size = model_bs
                else:
                    observation = env.reset()
                    batch_size = real_bs

    logDEBUG("Done with %d real episodes." % real_episodes)
</code>

  * This implementation works, except that I had to clamp the reward values predicted by the model network to some "reasonable value":

<code python>
def stepModel(sess, xs, action):
    toFeed = np.reshape(np.hstack([xs[-1][0],np.array(action)]),[1,5])
    myPredict = sess.run([predicted_state],feed_dict={previous_state: toFeed})

    # We should clip the model reward here:
    reward = myPredict[0][:,4]
    if abs(reward) > 1000.0:
        logDEBUG("Clipping model reward value %f" % reward)
        reward = np.clip(reward, -1000.0, 1000.0)
</code>

  * This is because we currently get very large reward predictions from time to time, such as:

<code>
2019-03-09T21:46:49.130780 [DEBUG] Clipping model reward value 27099967488.000000
2019-03-09T21:46:49.132562 [DEBUG] Clipping model reward value 29887655936.000000
2019-03-09T21:46:49.186213 [DEBUG] World Perf: Episode 187, Reward 34.666667, action: 0, mean reward 1337.982666.
2019-03-09T21:46:49.268631 [DEBUG] World Perf: Episode 190, Reward 34.333333, action: 1, mean reward 1311.774658.
</code>

  * And thus, without this clamping, the mean reward value gets completely "out of control":

<code>
2019-03-09T17:18:55.168245 [DEBUG] World Perf: Episode 850.000000. Reward 129.333333. action: 0.000000. mean reward 113.953552.
2019-03-09T17:18:55.630618 [DEBUG] World Perf: Episode 853.000000. Reward 138.666667. action: 0.000000. mean reward 89519814371966976.000000.
/mnt/array1/dev/projects/NervSeed/tools/linux/python-3.6/lib/python3.6/site-packages/numpy/core/_methods.py:116: RuntimeWarning: overflow encountered in multiply
  x = um.multiply(x, x, out=x)
2019-03-09T17:18:56.648155 [DEBUG] World Perf: Episode 856.000000. Reward 151.666667. action: 0.000000. mean reward 101388646698265757089792.000000.
</code>

===== Analysis =====

  * Nothing new to say on the policy network: it is exactly the same as the one we used in the post on the [[.0308_full_policy_gradient|Full policy Gradient agent]].
  * Let's rather focus on the model network.
  * Our input for this network has a dimension of 5: the 4 observation values plus the action that was taken.
  * Actually, the following part of the code is useless (so I will remove it):

<code python>
input_data = tf.placeholder(tf.float32, [None, 5])
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [mH, 50])
    softmax_b = tf.get_variable("softmax_b", [50])
</code>

  * For the model, we use 2 fully connected ReLU layers with mH=256 units each.
  * From those layers we then produce 3 different outputs: the predicted next observation, the next reward, and the next done state.
  * We then compute a loss for each of those outputs and sum them before performing the optimization step.
  * => The clever thing here is really that, from a single network, we can "split the outputs" into conceptually separate components, which could be a very useful trick in general (see the minimal sketch below).
  * Then, for the actual training, we train the model and the policy network separately: the model is trained on inputs from the real environment, whereas the policy is trained on inputs from our model network. Nothing too fancy here.
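  * To isolate that multi-output trick from the rest of the code, here is a minimal sketch of the pattern, written in the same TF1 style as above. Note that this is only an illustration: the layer sizes, variable names and the small epsilon added inside the log are my own choices, not part of the reference implementation.

<code python>
import tensorflow as tf

# Minimal multi-head network sketch (illustrative sizes and names, TF1 style):
# one shared trunk, three output heads, one combined loss.
tf.reset_default_graph()

inputs = tf.placeholder(tf.float32, [None, 5], name="inputs")

# Shared trunk: a single hidden layer is enough to show the idea.
Wh = tf.get_variable("Wh", shape=[5, 64], initializer=tf.contrib.layers.xavier_initializer())
bh = tf.Variable(tf.zeros([64]), name="bh")
hidden = tf.nn.relu(tf.matmul(inputs, Wh) + bh)

def head(name, size):
    # One small linear head on top of the shared trunk.
    W = tf.get_variable("W_" + name, shape=[64, size], initializer=tf.contrib.layers.xavier_initializer())
    b = tf.Variable(tf.zeros([size]), name="b_" + name)
    return tf.matmul(hidden, W) + b

pred_obs = head("obs", 4)                # regression head (next observation)
pred_reward = head("reward", 1)          # regression head (next reward)
pred_done = tf.sigmoid(head("done", 1))  # probability head (done flag)

true_obs = tf.placeholder(tf.float32, [None, 4])
true_reward = tf.placeholder(tf.float32, [None, 1])
true_done = tf.placeholder(tf.float32, [None, 1])

# One loss per head, then a single summed objective:
obs_loss = tf.reduce_mean(tf.square(true_obs - pred_obs))
reward_loss = tf.reduce_mean(tf.square(true_reward - pred_reward))
done_loss = -tf.reduce_mean(true_done * tf.log(pred_done + 1e-8)
                            + (1 - true_done) * tf.log(1 - pred_done + 1e-8))

total_loss = obs_loss + reward_loss + done_loss
train_op = tf.train.AdamOptimizer(1e-2).minimize(total_loss)
</code>

  * Writing the done loss as an explicit cross-entropy with a small epsilon inside the log is just a defensive variation to avoid ''log(0)''; for binary done values it is equivalent to the ''-log(p*d + (1-p)*(1-d))'' form used in the reference code.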