====== Model-based policy networks ======

{{tag>deep_learning}}

Continuing on our reinforcement learning path, we will consider here how to build a "model for our environment", and then use that model, instead of the real environment alone, to generate the episodes needed to train a policy network.

====== ======

===== References =====

  * [[https://medium.com/@awjuliani|Simple Reinforcement Learning with Tensorflow, Part 3: Model-Based RL (Arthur Juliani)]]
  * https://github.com/awjuliani/DeepRL-Agents/blob/master/Model-Network.ipynb

===== Initial implementation =====

  * So our reference implementation is here: <sxh python>
import random
import gym

import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

from nv.core.utils import *

# cf. https://github.com/awjuliani/DeepRL-Agents/blob/master/Model-Network.ipynb

def train_model_policy_network():
    logDEBUG("Building model-based policy network...")
    env = gym.make('CartPole-v0')

    # hyperparameters
    H = 8  # number of hidden layer neurons
    learning_rate = 1e-2
    gamma = 0.99  # discount factor for reward
    decay_rate = 0.99  # decay factor for RMSProp leaky sum of grad^2
    resume = False  # resume from previous checkpoint?

    model_bs = 3  # Batch size when learning from model
    real_bs = 3   # Batch size when learning from real environment

    # model initialization
    D = 4  # input dimensionality

    # Policy network:
    tf.reset_default_graph()
    observations = tf.placeholder(tf.float32, [None, D], name="input_x")
    W1 = tf.get_variable("W1", shape=[D, H],
                         initializer=tf.contrib.layers.xavier_initializer())
    layer1 = tf.nn.relu(tf.matmul(observations, W1))
    W2 = tf.get_variable("W2", shape=[H, 1],
                         initializer=tf.contrib.layers.xavier_initializer())
    score = tf.matmul(layer1, W2)
    probability = tf.nn.sigmoid(score)

    tvars = tf.trainable_variables()
    input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
    advantages = tf.placeholder(tf.float32, name="reward_signal")

    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    W1Grad = tf.placeholder(tf.float32, name="batch_grad1")
    W2Grad = tf.placeholder(tf.float32, name="batch_grad2")
    batchGrad = [W1Grad, W2Grad]
    loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
    loss = -tf.reduce_mean(loglik * advantages)
    newGrads = tf.gradients(loss, tvars)
    updateGrads = adam.apply_gradients(zip(batchGrad, tvars))


    # Model network:
    mH = 256  # model layer size

    input_data = tf.placeholder(tf.float32, [None, 5])
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [mH, 50])
        softmax_b = tf.get_variable("softmax_b", [50])

    previous_state = tf.placeholder(tf.float32, [None, 5], name="previous_state")
    W1M = tf.get_variable("W1M", shape=[5, mH],
                          initializer=tf.contrib.layers.xavier_initializer())
    B1M = tf.Variable(tf.zeros([mH]), name="B1M")
    layer1M = tf.nn.relu(tf.matmul(previous_state, W1M) + B1M)
    W2M = tf.get_variable("W2M", shape=[mH, mH],
                          initializer=tf.contrib.layers.xavier_initializer())
    B2M = tf.Variable(tf.zeros([mH]), name="B2M")
    layer2M = tf.nn.relu(tf.matmul(layer1M, W2M) + B2M)
    wO = tf.get_variable("wO", shape=[mH, 4],
                         initializer=tf.contrib.layers.xavier_initializer())
    wR = tf.get_variable("wR", shape=[mH, 1],
                         initializer=tf.contrib.layers.xavier_initializer())
    wD = tf.get_variable("wD", shape=[mH, 1],
                         initializer=tf.contrib.layers.xavier_initializer())

    bO = tf.Variable(tf.zeros([4]), name="bO")
    bR = tf.Variable(tf.zeros([1]), name="bR")
    bD = tf.Variable(tf.ones([1]), name="bD")


    predicted_observation = tf.matmul(layer2M, wO, name="predicted_observation") + bO
    predicted_reward = tf.matmul(layer2M, wR, name="predicted_reward") + bR
    predicted_done = tf.sigmoid(tf.matmul(layer2M, wD, name="predicted_done") + bD)

    true_observation = tf.placeholder(tf.float32, [None, 4], name="true_observation")
    true_reward = tf.placeholder(tf.float32, [None, 1], name="true_reward")
    true_done = tf.placeholder(tf.float32, [None, 1], name="true_done")


    predicted_state = tf.concat([predicted_observation, predicted_reward, predicted_done], 1)

    observation_loss = tf.square(true_observation - predicted_observation)

    reward_loss = tf.square(true_reward - predicted_reward)

    done_loss = tf.multiply(predicted_done, true_done) + tf.multiply(1 - predicted_done, 1 - true_done)
    done_loss = -tf.log(done_loss)

    model_loss = tf.reduce_mean(observation_loss + done_loss + reward_loss)

    modelAdam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    updateModel = modelAdam.minimize(model_loss)

    # helper functions:
    def resetGradBuffer(gradBuffer):
        for ix, grad in enumerate(gradBuffer):
            gradBuffer[ix] = grad * 0
        return gradBuffer

    def discount_rewards(r):
        """ take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, r.size)):
            running_add = running_add * gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r


    # This function uses our model to produce a new state when given a previous state and action
    def stepModel(sess, xs, action):
        toFeed = np.reshape(np.hstack([xs[-1][0], np.array(action)]), [1, 5])
        myPredict = sess.run([predicted_state], feed_dict={previous_state: toFeed})
        # We should clip the model reward here (clip to a plausible range for CartPole rewards):
        reward = myPredict[0][:, 4]
        if abs(reward) > 1.0:
            logDEBUG("Clipping model reward value: %s" % reward)
        reward = np.clip(reward, -1.0, 1.0)

        observation = myPredict[0][:, 0:4]
        observation[:, 0] = np.clip(observation[:, 0], -2.4, 2.4)
        observation[:, 2] = np.clip(observation[:, 2], -0.4, 0.4)
        doneP = np.clip(myPredict[0][:, 5], 0, 1)
        if doneP > 0.1 or len(xs) >= 300:
            done = True
        else:
            done = False
        return observation, reward, done

    # Training the model and policy together:
    xs, drs, ys, ds = [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 1
    real_episodes = 1
    init = tf.global_variables_initializer()
    batch_size = real_bs

    drawFromModel = False  # When set to True, will use model for observations
    trainTheModel = True   # Whether to train the model
    trainThePolicy = False # Whether to train the policy
    switch_point = 1

    # Launch the graph
    with tf.Session() as sess:
        rendering = False
        sess.run(init)
        observation = env.reset()
        x = observation
        gradBuffer = sess.run(tvars)
        gradBuffer = resetGradBuffer(gradBuffer)

        while episode_number <= 5000:
            # Start displaying environment once performance is acceptably high.
            # No rendering support yet.
            # if (reward_sum/batch_size > 150 and drawFromModel == False) or rendering == True:
            #     env.render()
            #     rendering = True

            x = np.reshape(observation, [1, 4])

            tfprob = sess.run(probability, feed_dict={observations: x})
            action = 1 if np.random.uniform() < tfprob else 0

            # record various intermediates (needed later for backprop)
            xs.append(x)
            y = 1 if action == 0 else 0
            ys.append(y)

            # step the model or real environment and get new measurements
            if drawFromModel == False:
                observation, reward, done, info = env.step(action)
                CHECK(reward <= 1000.0, "Invalid reward value from environment: %s" % reward)
            else:
                observation, reward, done = stepModel(sess, xs, action)
                CHECK(reward <= 1000.0, "Invalid reward value from model: %s" % reward)

            reward_sum += reward

            ds.append(done*1)
            drs.append(reward)  # record reward (has to be done after we call step() to get reward for previous action)

            if done:

                if drawFromModel == False:
                    real_episodes += 1
                episode_number += 1

                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                epx = np.vstack(xs)
                epy = np.vstack(ys)
                epr = np.vstack(drs)
                epd = np.vstack(ds)
                xs, drs, ys, ds = [], [], [], []  # reset array memory

                if trainTheModel == True:
                    actions = np.array([np.abs(y-1) for y in epy][:-1])
                    state_prevs = epx[:-1, :]
                    state_prevs = np.hstack([state_prevs, actions])
                    state_nexts = epx[1:, :]
                    rewards = np.array(epr[1:, :])
                    dones = np.array(epd[1:, :])
                    state_nextsAll = np.hstack([state_nexts, rewards, dones])

                    feed_dict = {previous_state: state_prevs, true_observation: state_nexts,
                                 true_done: dones, true_reward: rewards}
                    loss, pState, _ = sess.run([model_loss, predicted_state, updateModel], feed_dict)

                if trainThePolicy == True:
                    discounted_epr = discount_rewards(epr).astype('float32')
                    discounted_epr -= np.mean(discounted_epr)
                    dev = np.std(discounted_epr)
                    if dev > 0.0:
                        discounted_epr /= dev

                    tGrad = sess.run(newGrads, feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})

                    # If gradients become too large, end training process
                    if np.sum(tGrad[0] == tGrad[0]) == 0:
                        break
                    for ix, grad in enumerate(tGrad):
                        gradBuffer[ix] += grad

                if switch_point + batch_size == episode_number:
                    switch_point = episode_number
                    if trainThePolicy == True:
                        sess.run(updateGrads, feed_dict={W1Grad: gradBuffer[0], W2Grad: gradBuffer[1]})
                        gradBuffer = resetGradBuffer(gradBuffer)

                    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                    if drawFromModel == False:
                        logDEBUG('World Perf: Episode %f. Reward %f. action: %f. mean reward %f.' %
                                 (real_episodes, reward_sum/real_bs, action, running_reward/real_bs))
                        if reward_sum/batch_size > 200:
                            break
                    reward_sum = 0

                    # Once the model has been trained on 100 episodes, we start alternating between training the policy
                    # from the model and training the model from the real environment.
                    if episode_number > 100:
                        drawFromModel = not drawFromModel
                        trainTheModel = not trainTheModel
                        trainThePolicy = not trainThePolicy

                if drawFromModel == True:
                    observation = np.random.uniform(-0.1, 0.1, [4])  # Generate reasonable starting point
                    batch_size = model_bs
                else:
                    observation = env.reset()
                    batch_size = real_bs

    logDEBUG("Done training model-based policy network, real episodes: %d" % real_episodes)
</sxh>

  * This implementation works, except that I had to clamp the reward values predicted by the model network to some "acceptable" range inside stepModel(): <sxh python>
toFeed = np.reshape(np.hstack([xs[-1][0], np.array(action)]), [1, 5])
myPredict = sess.run([predicted_state], feed_dict={previous_state: toFeed})
# We should clip the model reward here (clip to a plausible range for CartPole rewards):
reward = myPredict[0][:, 4]
if abs(reward) > 1.0:
    logDEBUG("Clipping model reward value: %s" % reward)
reward = np.clip(reward, -1.0, 1.0)
</sxh>

  * This clamping is needed because we currently get very large reward predictions out of the model network from time to time: the debug logs occasionally show predicted reward values that are many orders of magnitude larger than the normal CartPole step reward of 1.0.
  * And thus, without this clamping, the mean reward value gets completely "out of control" and the policy training diverges, as illustrated in the small sketch below.
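
  * To get a feel for the issue, here is a small standalone sketch (the numbers are made up for illustration, they are not the actual values from my logs): a single absurd model-predicted reward dominates every discounted return of the episode, so the normalized advantages barely carry any per-step information anymore. <sxh python>
import numpy as np

gamma = 0.99

def discount_rewards(r):
    """ Same discounting as in the training code above. """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

# 50 ordinary CartPole step rewards, plus one absurd prediction from the model:
rewards = np.ones(50, dtype=np.float64)
rewards[25] = 1e20

adv = discount_rewards(rewards)
adv -= np.mean(adv)
adv /= np.std(adv)

# All the steps before the spike now get very similar advantage values, and the
# steps after it all collapse onto the same negative value: the per-step reward
# signal is effectively gone.
print(adv)
</sxh>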

===== Analysis =====

  * Nothing new to say on the policy network: it's exactly the same as the one we used in the post on the [[.0308_full_policy_gradient|Full policy Gradient agent]].
  * Let's rather focus on the model network.
  * Our input for this network has a dimension of 5: the 4 CartPole observations concatenated with the action we took.
  * Actually, the following part of the code is useless (so I will remove it): <sxh python>
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [mH, 50])
    softmax_b = tf.get_variable("softmax_b", [50])
</sxh>
  * For the model, we use 2 fully connected ReLU layers with mH=256 units each.
  * Then we get 3 different outputs from those layers: the predicted next observation, the predicted reward and the predicted "done" state.
  * Then we compute a loss for each of those outputs, and we sum them before we can do the optimization step.

  * => The clever thing here is really that, from a single network, we can consider that we "split the outputs" into several separate predictions, each with its own loss, and still train everything with a single optimization on the summed loss. A compact sketch of this layout is given just below.
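
  * As an illustration of this "shared trunk, multiple heads" idea, here is a compact sketch of the model network in isolation. Note that this is just a sketch: it uses tf.layers.dense for brevity instead of the explicit weight matrices of the listing above, and the variable names here are mine. <sxh python>
import tensorflow as tf

tf.reset_default_graph()

# Input: the 4 CartPole observations concatenated with the chosen action.
state_and_action = tf.placeholder(tf.float32, [None, 5])

# Shared trunk: 2 fully connected ReLU layers of 256 units.
trunk = tf.layers.dense(state_and_action, 256, activation=tf.nn.relu)
trunk = tf.layers.dense(trunk, 256, activation=tf.nn.relu)

# Three heads reading from the same trunk:
pred_obs = tf.layers.dense(trunk, 4)                             # next observation
pred_reward = tf.layers.dense(trunk, 1)                          # next reward
pred_done = tf.layers.dense(trunk, 1, activation=tf.nn.sigmoid)  # probability that the episode ends

# Targets collected from the real environment:
true_obs = tf.placeholder(tf.float32, [None, 4])
true_reward = tf.placeholder(tf.float32, [None, 1])
true_done = tf.placeholder(tf.float32, [None, 1])

# One loss per head, then a single summed loss for the optimizer:
obs_loss = tf.square(true_obs - pred_obs)
reward_loss = tf.square(true_reward - pred_reward)
done_loss = -tf.log(pred_done * true_done + (1.0 - pred_done) * (1.0 - true_done))

model_loss = tf.reduce_mean(obs_loss + reward_loss + done_loss)
train_op = tf.train.AdamOptimizer(1e-2).minimize(model_loss)
</sxh>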

  * Then for the actual training, we train the model and the policy network separately: the model is trained on transitions from the real environment, while the policy is trained on "virtual" episodes generated with the model network, alternating between the two phases as sketched below.
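
  * To make that schedule explicit, here is a rough sketch of the alternation logic only, with the TensorFlow plumbing replaced by hypothetical stub functions (the stubs and the "every 3 episodes" switch are simplifications of the switch_point/batch_size logic in the full listing): <sxh python>
# Hypothetical stand-ins for the real operations, only here to make the schedule readable:
def run_episode_in_real_env(): return None   # collect one CartPole episode
def run_episode_in_model(): return None      # "imagine" one episode with the model network
def train_model_on(episode): pass            # one optimization step on model_loss
def accumulate_policy_gradients(episode): pass

drawFromModel = False   # where the episodes come from
trainTheModel = True    # which network gets updated
trainThePolicy = False

for episode_number in range(1, 5001):
    episode = run_episode_in_model() if drawFromModel else run_episode_in_real_env()

    if trainTheModel:
        train_model_on(episode)
    if trainThePolicy:
        accumulate_policy_gradients(episode)

    # After a warm-up spent only training the model on real data, we flip all three
    # switches at every batch boundary, alternating between "improve the model on
    # real episodes" and "improve the policy on imagined episodes".
    if episode_number > 100 and episode_number % 3 == 0:
        drawFromModel = not drawFromModel
        trainTheModel = not trainTheModel
        trainThePolicy = not trainThePolicy
</sxh>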