====== QNetwork learning ======

Continuing on my current "reinforcement learning" experiments: in my [[.0305_prob_qtable_learning|previous post]] we built a Q-table for our gym environment, so this time we replace that table with a minimal neural network, a **Q-network**, trained with TensorFlow.

====== ======

===== References =====

  * [[https://

===== Reference implementation =====

  * The reference implementation I will be using is as follows: <sxh python>import gym
import numpy as np
import random
import tensorflow as tf
from nv.core.utils import *

def train_qn_env(numEpisodes=2000, ename="FrozenLake-v0"):
    logDEBUG("Building gym environment %s..." % ename)
    env = gym.make(ename)

    # Implementation of the network:
    tf.reset_default_graph()

    # These lines establish the feed-forward part of the network used to choose actions
    inputs1 = tf.placeholder(shape=[1, 16], dtype=tf.float32)
    W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
    Qout = tf.matmul(inputs1, W)
    predict = tf.argmax(Qout, 1)

    # Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
    nextQ = tf.placeholder(shape=[1, 4], dtype=tf.float32)
    loss = tf.reduce_sum(tf.square(nextQ - Qout))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    updateModel = trainer.minimize(loss)

    # Actual training of the model:
    logDEBUG("Training the model...")
    init = tf.global_variables_initializer()

    # Set learning parameters
    y = .99
    e = 0.1

    # Create lists to contain total rewards and steps per episode
    jList = []
    rList = []

    with tf.Session() as sess:
        sess.run(init)

        for i in range(numEpisodes):
            # Reset environment and get first new observation
            s = env.reset()
            rAll = 0
            d = False
            j = 0

            # The Q-Network
            while j < 99:
                j += 1

                # Choose an action greedily (with e chance of random action) from the Q-network
                a, allQ = sess.run([predict, Qout], feed_dict={inputs1: np.identity(16)[s:s+1]})
                if np.random.rand(1) < e:
                    a[0] = env.action_space.sample()

                # Get new state and reward from environment
                s1, r, d, _ = env.step(a[0])

                # Obtain the Q' values by feeding the new state through our network
                Q1 = sess.run(Qout, feed_dict={inputs1: np.identity(16)[s1:s1+1]})
                # Obtain maxQ' and set our target value for the chosen action.
                maxQ1 = np.max(Q1)

                targetQ = allQ
                targetQ[0, a[0]] = r + y*maxQ1

                # Train our network using target and predicted Q values
                _, W1 = sess.run([updateModel, W], feed_dict={inputs1: np.identity(16)[s:s+1], nextQ: targetQ})
                rAll += r
                s = s1

                if d == True:
                    # Reduce chance of random action as we train the model.
                    e = 1./((i/50) + 10)
                    break

            jList.append(j)
            rList.append(rAll)

    logDEBUG("Percent of successful episodes: %.2f%%" % (100.0*sum(rList)/numEpisodes))

    logDEBUG("Training done.")</sxh>

  * So our reference results are as follows:

{{ projects:

{{ projects:

===== Initial investigations =====

  * The first thing we can notice about the previous implementation is that **it is slow**, much slower than the initial QTable version we played with... But this is not really surprising, since we are now training the weights of a network with gradient descent on every single step, instead of simply updating one entry in a table.

  * Let's start with a basic timing of the training session: the reference implementation above took about **81.27 seconds** to train on 2000 episodes.

===== Cleaning session run calls =====

  * Currently, we start with a given state \(S_t\), and we feed that into our network to compute the action values for that state: \(Q_{out} = S_t * W\). And then we keep performing a single change/update of the network at a time, with several separate session.run() calls per step. But since our inputs are just one-hot state vectors, we can instead feed the 16x16 identity matrix by default and evaluate the Q values of **all the states at once**, removing most of those intermediate run calls.

  * So I just refactored the implementation a little to remove the unneeded calls to session.run(), as follows: <sxh python>logDEBUG("Building gym environment %s..." % ename)
env = gym.make(ename)

# Implementation of the network:
tf.reset_default_graph()

# These lines establish the feed-forward part of the network used to choose actions
# inputs = tf.placeholder(shape=[1, 16], dtype=tf.float32)
inputs = tf.placeholder_with_default(np.identity(16, dtype=np.float32), shape=[16, 16])

W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
Qout = tf.matmul(inputs, W)

# Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[16, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

# Actual training of the model:
logDEBUG("Training the model...")
init = tf.global_variables_initializer()

# Set learning parameters
y = .99
e = 0.1

# Array containing the total reward and number of steps per episode:
rList = np.zeros((numEpisodes, 2))

with tf.Session() as sess:
    sess.run(init)

    for i in range(numEpisodes):
        # Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        d = False
        j = 0

        action = None

        # The Q-Network
        while j < 99:
            j += 1

            # Choose an action greedily (with e chance of random action) from the Q-network:
            # with the identity matrix as default input, we get the Q values of all 16 states in one call.
            allQ = sess.run(Qout)

            if np.random.rand(1) < e:
                action = env.action_space.sample()
            else:
                action = np.argmax(allQ[s, :])

            # Get new state and reward from environment
            s1, r, d, _ = env.step(action)

            # Now we already have the Q' values since we evaluated all states above:
            # Obtain maxQ' and set our target value for the chosen action.
            maxQ1 = np.max(allQ[s1, :])

            targetQ = allQ
            targetQ[s, action] = r + y*maxQ1

            # Train our network using target and predicted Q values
            sess.run(updateModel, feed_dict={nextQ: targetQ})
            rAll += r
            s = s1

            if d == True:
                # Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break

        # And we assign our data for visualization:
        rList[i] = [rAll, j]

        if (i+1) % 100 == 0:
            mvals = np.mean(rList[i-99:i+1], axis=0)
            logDEBUG("Episode %d: mean reward: %.3f, mean num steps: %.3f" % (i+1, mvals[0], mvals[1]))

return rList</sxh>

  * => And with that we already get approximately a **x2** speedup. Let's continue now and try to figure out how to train multiple states at the same time...

===== Multi-inputs training =====

  * What if we could train on multiple inputs at the same time?

  * Back to the **transition matrix** concept I introduced in my [[.0305_prob_qtable_learning|previous post on Q tables]]: I would like to try to introduce the same idea here. Basically, at each step we use all the knowledge collected so far to build the complete \(Q_{next}\) target matrix, containing the value \(r + \gamma \, Q^n_{max}(S_{t+1})\) for every (state, action) pair we have observations for, using the mean observed rewards and the observed transition probabilities.

  * So, the third implementation I tried was the following: <sxh python>logDEBUG("Building gym environment %s..." % ename)
env = gym.make(ename)

nstates = env.observation_space.n
nactions = env.action_space.n

# Implementation of the network:
tf.reset_default_graph()

# These lines establish the feed-forward part of the network used to choose actions
# inputs = tf.placeholder(shape=[1, 16], dtype=tf.float32)
inputs = tf.placeholder_with_default(np.identity(16, dtype=np.float32), shape=[16, 16])

W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
Qout = tf.matmul(inputs, W)

# Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[16, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

# Actual training of the model:
logDEBUG("Training the model...")
init = tf.global_variables_initializer()

# Set learning parameters
y = .99
e = 0.1

# Array containing the total reward and number of steps per episode:
rList = np.zeros((numEpisodes, 2))

# Here we will also use a state transition tensor:
tProbs = np.zeros((nstates, nactions, nstates))

# And a reward probability matrix:
# Note: the first column of this matrix tells us how many
# observations we have, and the second column tells us what
# is the current total reward we got so far with this path.
rProbs = np.zeros((nstates, nactions, 2))

# Target Q matrix:
targetQ = np.zeros((nstates, nactions))

# epsilon value:
eps = 1e-10

with tf.Session() as sess:
    sess.run(init)

    for i in range(numEpisodes):
        # Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        d = False
        j = 0

        action = None

        # The Q-Network
        while j < 99:
            j += 1

            # Choose an action greedily (with e chance of random action) from the Q-network
            allQ = sess.run(Qout)

            if np.random.rand(1) < e:
                action = env.action_space.sample()
            else:
                action = np.argmax(allQ[s, :])

            # Get new state and reward from environment
            s1, r, d, _ = env.step(action)

            # Fill our transitions and reward matrices:
            tProbs[s, action, s1] += 1.0
            rProbs[s, action, :] += [1.0, r]

            # Now we update all the nextQ values together:
            # For each (S,a) pair in the targetQ matrix, we need to set the value
            # to: mean_reward + y * expected_next_state value.
            # So we start with the mean reward:
            targetQ = rProbs[:, :, 1] / (rProbs[:, :, 0] + eps)

            # Compute the transition weights from each state:
            weights = tProbs / (np.sum(tProbs, axis=2, keepdims=True) + eps)

            # So for each (S, a) pair, we have the nstates weights for each possible following state
            # in the weight matrix above.
            # We need to dot product this with the max Q' values of each state:
            maxQs = np.max(allQ, axis=1)

            targetQ += y * np.dot(weights, maxQs)

            # Train our network using target and predicted Q values
            sess.run(updateModel, feed_dict={nextQ: targetQ})
            rAll += r
            s = s1

            if d == True:
                # Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break

        # And we assign our data for visualization:
        rList[i] = [rAll, j]

        if (i+1) % 100 == 0:
            mvals = np.mean(rList[i-99:i+1], axis=0)
            logDEBUG("Episode %d: mean reward: %.3f, mean num steps: %.3f" % (i+1, mvals[0], mvals[1]))

return rList</sxh>

  * And the results seem to be significantly better than with the previous implementations:

{{ projects:

===== Recursive Q target update =====

  * The next step I would like to take, now that we can update all the Q values together in our network, is to try to improve the quality of the **Q target matrix**: in the code above, we build this target matrix with **1 update cycle**, using the values currently available in our Q value matrix... But if we consider that this is the targetQ we would "like to reach", then we might as well feed that target back into the same computation and repeat the update, to get a more self-consistent target matrix.

  * So I introduced the following function to compute the targetQ matrix recursively (but we need a counter, because it's not really converging): <sxh python># Note: the recursion budget and convergence threshold below are indicative default values.
def updateQTarget(sourceQ, counter=20, threshold=1e-6):
    # Now we update all the nextQ values together:
    # For each (S,a) pair in the targetQ matrix, we need to set the value
    # to: mean_reward + y * expected_next_state value.
    # So we start with the mean reward:
    targetQ = rProbs[:, :, 1] / (rProbs[:, :, 0] + eps)

    # Compute the transition weights from each state:
    weights = tProbs / (np.sum(tProbs, axis=2, keepdims=True) + eps)

    # So for each (S, a) pair, we have the nstates weights for each possible following state
    # in the weight matrix above.
    # We need to dot product this with the max Q' values of each state:
    maxQs = np.max(sourceQ, axis=1)

    targetQ += y * np.dot(weights, maxQs)

    # Compute the difference between the source and target matrices:
    tval = np.sum(np.sum(np.abs(targetQ)))
    sval = np.sum(np.sum(np.abs(sourceQ)))
    if tval == sval:
        return targetQ

    diff = 2.0*abs(tval - sval)/(tval + sval)

    if diff > threshold and counter > 0:
        # logDEBUG("Recursing on targetQ update, diff: %f" % diff)
        return updateQTarget(targetQ, counter-1, threshold)

    return targetQ</sxh>

  * Then I update the targetQ matrix from the current allQ matrix in the training loop: <sxh python># Fill our transitions and reward matrices:
tProbs[s, action, s1] += 1.0
rProbs[s, action, :] += [1.0, r]

targetQ = updateQTarget(allQ)

# Train our network using target and predicted Q values
sess.run(updateModel, feed_dict={nextQ: targetQ})
rAll += r
s = s1</sxh>

  * With this implementation, here are the results I got:

{{ projects:

  * => This seems just a little bit worse than with the non-recursive target Q matrix computation. And one thing I noticed already is that the recursive computation is not converging :-( which is not quite what I would expect...

  * So what can we do to make it converge?

  * Let's rethink a bit how we can compute our \(Q(S,a)\) values: first, we will now consider that we give a "value" to each state \(S_{t+1}\) that we can reach from our current state.

  * Actually we could even consider that this value depends on the path we take to reach that state: so the value we compute becomes \(Q(S_t, a, S_{t+1})\), and for each such transition we keep track of the number of times we took it and the total reward we received.

  * Next we need to compute \(Q(S,a)\): this action can lead us to other states \(S_i\), each with a probability \(p_i\), and the reward we can expect in that case is the mean reward that we can extract from our \(Q(S, a, S_i)\) tensor described above.

  * So we have \[Q(S,a) = \sum_i p_i \times \left(\frac{Q[S, a, S_i, 1]}{Q[S, a, S_i, 0]} + \gamma \, Q(S_i)\right)\]

with: \[p_i = \frac{Q[S, a, S_i, 0]}{\sum_j Q[S, a, S_j, 0]} \]

  * Then we can compute the \(Q(S_i)\) values as: \(Q(S_i) = \max_a(Q(S_i, a))\)
  * Then we can recursively recompute the \(Q(S_i, a)\) values from those state values, and iterate.

  * Let's clarify this even further:
    * We call \(Q^t_{i,j}\) the target Q value for state \(i\) and action \(j\),
    * We call \(R_{i,j,k}\) the mean reward observed when taking action \(j\) in state \(i\) and ending in state \(k\),
    * We call \(Q^s_k\) the intrinsic value of the state \(k\),
    * We call \(p_{i,j,k}\) the probability of ending in state \(k\) when taking action \(j\) in state \(i\).

  * With those definitions, our formula above becomes:

\[Q^t_{i,j} = \sum_k p_{i,j,k} \left( R_{i,j,k} + \gamma Q^s_k \right)\]

  * Then we should separate the "self" transition term (the case where the action leaves us in the same state, \(k = i\)) from the other transitions:

\[Q^t_{i,j} = \sum_{k \neq i} p_{i,j,k} \left( R_{i,j,k} + \gamma Q^s_k \right) + p_{i,j,i} \left( R_{i,j,i} + \gamma Q^s_i \right)\]

  * Now, by definition we have: \(Q^s_i = \max_{j}(Q^t_{i,j})\)

  * So the idea is to update all the \(Q^t_{i,j}\) values together, then refresh the \(Q^s_i\) values from them, and iterate until this stabilizes (a small sketch of this update loop is given below).
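
  * Here is a minimal standalone numpy sketch of that iteration, assuming a hypothetical transition tensor T of shape (nstates, nactions, nstates, 2) holding the counts and the accumulated rewards described above (illustration only, with made-up data): <sxh python>import numpy as np

nstates, nactions = 16, 4
y, eps = 0.99, 1e-10

# Hypothetical transition tensor: T[i, j, k, 0] = number of times we went from
# state i to state k when taking action j, T[i, j, k, 1] = total reward collected.
T = np.zeros((nstates, nactions, nstates, 2))
counts = np.random.randint(0, 5, size=(nstates, nactions, nstates)).astype(np.float64)
T[:, :, :, 0] = counts
T[:, :, :, 1] = counts * np.random.rand(nstates, nactions, nstates)

# Transition probabilities p_{i,j,k} and mean rewards R_{i,j,k}:
p = T[:, :, :, 0] / (np.sum(T[:, :, :, 0], axis=2, keepdims=True) + eps)
R = T[:, :, :, 1] / (T[:, :, :, 0] + eps)

# Iterate: Qt_{i,j} = sum_k p_{i,j,k} * (R_{i,j,k} + y * Qs_k), then Qs_i = max_j Qt_{i,j}:
Qs = np.zeros(nstates)
for it in range(100):
    Qt = np.sum(p * (R + y * Qs[None, None, :]), axis=2)
    newQs = np.max(Qt, axis=1)
    if np.max(np.abs(newQs - Qs)) < 1e-6:
        break
    Qs = newQs

print(Qt.shape, Qs.shape)   # => (16, 4) (16,)</sxh>
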
/*
  * So, this max value has to verify:

\[ Q^s_i = \underset{j}{max}(Q^t_{i,j}) \]
*/

  * But unfortunately, this computation would still not converge as such. So I tried to normalize the target Q matrix on each update: <sxh python>targetQ /= np.sum(targetQ, axis=1, keepdims=True)  # normalize the target values for each state</sxh>

  * And indeed, with this change it seems we are converging, but then the results we can achieve from this are **not good**. So this path is not really worth it.

  * => Anyway, never mind :-)! Time to move on to a new experiment now ;-)!