====== Probabilistic QTable learning ======

{{tag>}}

As mentioned in my previous post on this subject, I feel it could be worth investigating a bit further on this "Q table learning" mechanism.
And more specifically, I want to check if we could treat each row of the Q table as a probability distribution over the available actions, rather than as plain "quality" values.

====== ======

===== Initial model =====
+ | |||
So I spent some time trying to build a probabilistic Q table learning mechanism and ended up with this code: <sxh python>
import random

import gym
import numpy as np

# NOTE: the parameter list below is reconstructed from the usage in the body,
# and logDEBUG is my own logging helper (a plain print would do too).
def train_frozenlake_v2(numEpisodes, learningRate, drate, stepCost,
                        coopRatio=1.0, min_epsilon=0.01, max_epsilon=1.0):
    logDEBUG("Building FrozenLake environment...")
    env = gym.make('FrozenLake-v0')

    # Retrieve the environment dimensions:
    nstates = env.observation_space.n
    nactions = env.action_space.n

    # We don't want to start with a zero value Q table:
    # We are going to consider those values as "probabilities" on each row
    # of the table, so they should represent correct probabilities:
    Q = np.ones([nstates, nactions]) / nactions

    logDEBUG("Initial Q table: %s" % Q)

    # Set learning parameters
    lr = learningRate
    y = .95  # gamma (ie. discount rate)

    # numEpisodes = 2000
    maxSteps = 99

    # Exploration parameters
    epsilon = 1.0       # Exploration rate
    # decay_rate = drate/numEpisodes
    decay_rate = drate  # Exponential decay rate for exploration prob
    # decay rate detail: after numEpisodes episodes we reach a prob of exp(-7)~0.0009

    actions = np.arange(nactions)

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    for i in range(numEpisodes):
        # Reset environment and get first new observation
        state = env.reset()

        totalReward = 0
        done = False
        step = 0

        # (state, action) pairs visited during this episode:
        path = np.zeros((maxSteps, 2))

        # The Q-Table learning algorithm
        while step < maxSteps:
            # Check if we should do exploration or exploitation:
            exp_thres = random.uniform(0, 1)

            action = None
            if exp_thres < epsilon:
                # We do exploration:
                action = env.action_space.sample()
            else:
                # We do exploitation,
                # but here we should consider a probability sampling instead of an ordinary argmax:
                # action = np.argmax(Q[state, :])
                action = np.random.choice(actions, p=Q[state, :])

            # Get new state and reward from environment
            newState, reward, done, _ = env.step(action)

            # We don't want to update the Q table right now with the Bellman formula;
            # instead we keep track of the action we selected for that state,
            # and we will decide later if this was a good choice or not and apply
            # the reward only at that time:
            path[step] = [float(state), float(action)]

            # Q[state, action] = Q[state, action] + lr*(reward + y*np.max(Q[newState, :]) - Q[state, action])

            # Update total reward:
            totalReward += (reward - stepCost)

            # Update state:
            state = newState

            step += 1

            # Stop if we are done:
            if done:
                break

        # When done with the episode we apply the training:
        # we have to distribute the total reward that we got, taking into account
        # the probability of the actions taken in the process.

        # Remove the non-relevant data:
        spath = path[:step].astype(int)

        # coop ratio: the lower the ratio, the "less cooperative" the distribution.
        # (the exact weighting was truncated in the original page; as a plausible
        # reconstruction we weight each step by the inverse probability of the
        # action that was actually taken, with a small guard against zero probs)
        weights = 1.0 / np.power(Q[spath[:, 0], spath[:, 1]] + 1e-8, coopRatio)

        # normalize the weights:
        weights /= np.sum(weights)

        # If the total reward is 0.0 then we apply a small penalty:
        # if totalReward == 0.0:
        #     totalReward = -0.01

        # Now we train the probability for each previously performed step:
        Q[spath[:, 0], spath[:, 1]] += lr * totalReward * weights

        # Finally, we should renormalize our probability distributions:
        Q = np.clip(Q, a_min=0.0, a_max=None)
        Q /= np.sum(Q, axis=1).reshape(-1, 1)

        # Then we reduce the exploration rate:
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i)

        if (i+1) % 1000 == 0:
            # Compute the mean reward over the last 100 episodes:
            mvals = np.mean(rList[i-100:i], axis=0)
            logDEBUG("Episode %d: mean reward: %f, mean num steps: %f" % (i+1, mvals[0], mvals[1]))
            # env.render()

        # And we assign our data for visualization:
        rList[i] = [totalReward, step]

    logDEBUG("Done with the training.")

    # return the data array:
    return rList, Q
</sxh>

  * This was eventually producing "very sharp" Q tables, such as: <code>
[[ 0.        0.        0.        1.      ]
 [ 1.        0.        0.        0.      ]
 [ 0.007265
 [ 0.        0.        1.        0.      ]
 [ 0.25      0.25      0.25      0.25    ]
 [ 0.        0.        1.        0.      ]
 [ 0.25      0.25      0.25      0.25    ]
 [ 0.        1.        0.        0.      ]
 [ 0.        1.        0.        0.      ]
 [ 0.        1.        0.        0.      ]
 [ 0.25      0.25      0.25      0.25    ]
 [ 0.25      0.25      0.25      0.25    ]
 [ 0.        0.        0.        1.      ]
 [ 0.        1.        0.        0.      ]
 [ 0.25      0.25      0.25      0.25    ]]</code>
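
Since each row of this table is a probability distribution over the 4 actions, reading a policy out of it is direct; a small illustrative sketch (not part of the original code):

<sxh python>
# Greedy policy: take the most likely action in each state.
policy = np.argmax(Q, axis=1)

# Or keep the stochasticity and sample the action for a given state s:
s = 0
action = np.random.choice(np.arange(Q.shape[1]), p=Q[s, :])
</sxh>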
+ | |||
+ | * **But** then the results I could get out of this Qtable were not any better than the previous implementation. So I was wondering how this could be possible because I thought the environment was deterministic... But I was wrong, since the environment description contains this indication: | ||
+ | |||
+ | < | ||
+ | |||
+ | * **Stupid you Manu** ;-) You just have read this more carefully from the beginning! | ||
+ | |||
+ | ===== Re-considering our implementation ===== | ||
+ | |||
+ | * So basically, our actions in the environment will have a non-deterministic result. So let's try with a **non-slippery version of the environment** first. | ||
+ | | ||

  * => And indeed, with a non-slippery environment, the probabilistic Q table learning now works very well:

{{ projects:

  * But then, we can achieve the same kind of result with the Bellman formula in such an environment:

{{ projects:

  * **Note**: The "slippery" behaviour of this environment works as follows (see the sketch after this list):
    * if slippery: the agent moves in one of the 3 "adjacent" directions (the intended one, or one of the two perpendicular ones), each with a 33.33% probability,
    * if not slippery: the agent takes a single step in the intended direction, with 100% probability.
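
This matches what Gym's ''FrozenLakeEnv'' does internally when building its transition table; roughly paraphrased:

<sxh python>
import numpy as np

# Sketch of the "slippery" action resolution in FrozenLake:
# the executed direction is either the intended one or one of the two
# perpendicular ones, each with probability 1/3.
def resolve_action(a, slippery=True):
    if not slippery:
        return a  # deterministic: always the intended direction
    return np.random.choice([(a - 1) % 4, a, (a + 1) % 4])
</sxh>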
+ | |||
+ | * So, the question is: is there something better we could do to training our Qtable given this new information ? Or should I just stick to that Bellman formula ? | ||
+ | |||
+ | * Actually, if we think about it: when we get a reward after doing action \(a_t\) in state \(S_t\), we update the **quality** of that action/ | ||
+ | |||
+ | * But, in this environment, | ||
+ | |||
+ | * => Why not instead try to " | ||
+ | |||
+ | * And now I'm thinking about something else: in the equation above, we use \( max Q' | ||
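
  * Concretely, writing \( \hat{P}(s' \mid s, a) \) for the normalized transition counts collected while playing, the update becomes: \( Q(S_t, a_t) \leftarrow Q(S_t, a_t) + \alpha \left( r_t + \gamma \sum_{s'} \hat{P}(s' \mid S_t, a_t) \max_a Q(s', a) - Q(S_t, a_t) \right) \). This sum is exactly the dot product between the transition weights and the vector of per-state max Q values computed in the code below.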
+ | |||
+ | * And well, I think this was a good move actually, using this " | ||
+ | |||
+ | {{ projects: | ||
+ | |||
+ | And this is quite better than the reference implementation results below: | ||
+ | |||
+ | {{ projects: | ||
+ | |||
  * So, the current implementation I'm using now (with transition matrix) is as follows: <sxh python>
# NOTE: the parameter list below is reconstructed from the usage in the body.
def train_frozenlake_v3(numEpisodes, learningRate, drate, stepCost, ename='FrozenLake-v0'):
    logDEBUG("Building environment %s..." % ename)
    env = gym.make(ename)

    # Initialize table with all zeros
    nstates = env.observation_space.n
    nactions = env.action_space.n
    Q = np.zeros([nstates, nactions])
    logDEBUG("Initial Q table: %s" % Q)

    # Set learning parameters
    lr = learningRate
    y = .95  # gamma (ie. discount rate)

    # numEpisodes = 2000
    maxSteps = 99

    # Exploration parameters
    epsilon = 1.0       # Exploration rate
    max_epsilon = 1.0   # Exploration probability at start
    min_epsilon = 0.01  # Minimum exploration probability
    # decay_rate = drate/numEpisodes
    decay_rate = drate  # Exponential decay rate for exploration prob
    # decay rate detail: after numEpisodes episodes we reach a prob of exp(-7)~0.0009

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    # Here we will also use a transition tensor:
    # trans[a, s, s2] counts how many times taking action a in state s led to state s2.
    trans = np.zeros((nactions, nstates, nstates))

    for i in range(numEpisodes):
        # Reset environment and get first new observation
        state = env.reset()

        totalReward = 0
        done = False
        step = 0

        steps = []

        # The Q-Table learning algorithm
        while step < maxSteps:
            step += 1

            # Check if we should do exploration or exploitation:
            exp_thres = random.uniform(0, 1)

            action = None
            if exp_thres < epsilon:
                # We do exploration:
                action = env.action_space.sample()
            else:
                # We do exploitation:
                action = np.argmax(Q[state, :])

            # Get new state and reward from environment
            newState, reward, done, _ = env.step(action)

            # We have done one step, so we update the transition matrix:
            trans[action, state, newState] += 1

            # get the probabilistic next state weights:
            weights = trans[action, state] / np.sum(trans[action, state])
            maxQs = np.max(Q, axis=1)

            # compute the dot product of the maxQ values and the weights:
            maxQ = np.dot(weights, maxQs)

            # Update Q-Table with new knowledge using Bellman formula:
            # Q[state, action] = Q[state, action] + lr*(reward + y*np.max(Q[newState, :]) - Q[state, action])
            Q[state, action] = Q[state, action] + lr*(reward + y*maxQ - Q[state, action])

            # Update total reward:
            totalReward += (reward - stepCost)

            # Update state:
            state = newState

            # Stop if we are done:
            if done:
                break

        # Then we reduce the exploration rate:
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i)

        if (i+1) % 1000 == 0:
            # Compute the mean reward over the last 100 episodes:
            mvals = np.mean(rList[i-100:i], axis=0)
            logDEBUG("Episode %d: mean reward: %f, mean num steps: %f" % (i+1, mvals[0], mvals[1]))
            # env.render()

        # And we assign our data for visualization:
        rList[i] = [totalReward, step]

    logDEBUG("Done with the training.")

    # return the data array:
    return rList, Q
</sxh>
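
With the ''ename'' parameter, the same function can then be pointed at either variant of the environment; a hypothetical comparison run, assuming the custom id registered earlier:

<sxh python>
# Compare the transition-matrix version on both environment variants:
rList1, Q1 = train_frozenlake_v3(20000, 0.8, 7.0/20000, 0.0, ename='FrozenLake-v0')
rList2, Q2 = train_frozenlake_v3(20000, 0.8, 7.0/20000, 0.0, ename='FrozenLakeNotSlippery-v0')
</sxh>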
+ | |||
+ | * **Note**: Actually we could also **" | ||
+ | |||
+ | * Now, another step I think might also make a difference in the training is if we normalize the Q table for each row: this could given more " | ||
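
A minimal sketch of such a row normalization, skipping all-zero rows to avoid a division by zero:

<sxh python>
# Normalize each row of the Q table so that rows become comparable across states;
# rows that are still all zero are left untouched:
sums = np.sum(np.abs(Q), axis=1, keepdims=True)
Qn = np.divide(Q, sums, out=Q.copy(), where=sums > 0)
</sxh>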
+ | |||
+ | |||
+ | ===== Recursive reward distribution ===== | ||
+ | |||
+ | * When we get a reward at a given step, currently this will only chance the value of the immediate (state, | ||
+ | |||
  * So I tried the following code: <sxh python>
# NOTE: both signatures below are reconstructed from the usage in the bodies.
def propagateQValue(Q, trans, state, reward):
    # Stop the recursion once the propagated reward becomes negligible:
    if abs(reward) <= 1e-6:
        return Q

    # How often did each (action, prev_state) pair lead into 'state'?
    total = np.sum(trans[:, :, state])
    if total <= 0:
        # Nothing has transitioned into this state yet: avoid a division by zero.
        return Q
    weights = trans[:, :, state] / total

    # the weights above are given by action and then state, so we should transpose the matrix:
    weights = np.transpose(weights)

    Q += reward * weights

    # We sum the weights for each state:
    weights = np.sum(weights, axis=1)

    # And we recursively propagate the (attenuated) reward to the predecessor states:
    for i in range(weights.shape[0]):
        if i != state:
            Q = propagateQValue(Q, trans, i, reward * weights[i])

    return Q

def train_frozenlake_v4(numEpisodes, learningRate, drate, stepCost, ename='FrozenLake-v0'):
    logDEBUG("Building environment %s..." % ename)
    env = gym.make(ename)

    # Initialize table with all zeros
    nstates = env.observation_space.n
    nactions = env.action_space.n
    Q = np.zeros([nstates, nactions])
    logDEBUG("Initial Q table: %s" % Q)

    # Set learning parameters
    lr = learningRate
    y = .95  # gamma (ie. discount rate)

    # numEpisodes = 2000
    maxSteps = 99

    # Exploration parameters
    epsilon = 1.0       # Exploration rate
    max_epsilon = 1.0   # Exploration probability at start
    min_epsilon = 0.01  # Minimum exploration probability
    # decay_rate = drate/numEpisodes
    decay_rate = drate  # Exponential decay rate for exploration prob
    # decay rate detail: after numEpisodes episodes we reach a prob of exp(-7)~0.0009

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    # Here we will also use a transition tensor:
    trans = np.zeros((nactions, nstates, nstates))

    for i in range(numEpisodes):
        # Reset environment and get first new observation
        state = env.reset()

        totalReward = 0
        done = False
        step = 0

        steps = []

        # The Q-Table learning algorithm
        while step < maxSteps:
            step += 1

            # Check if we should do exploration or exploitation:
            exp_thres = random.uniform(0, 1)

            action = None
            if exp_thres < epsilon:
                # We do exploration:
                action = env.action_space.sample()
            else:
                # We do exploitation:
                action = np.argmax(Q[state, :])

            # Get new state and reward from environment
            newState, reward, done, _ = env.step(action)

            # We have done one step, so we update the transition matrix:
            trans[action, state, newState] += 1

            # If there is a reward to distribute (either positive or negative), we should
            # consider how we reached this current (state/action) pair,
            # and we should do so recursively:
            Q = propagateQValue(Q, trans, newState, lr*reward)

            # Previous, non-recursive version of the same idea:
            # if reward != 0.0:
            #     weights = trans[:, :, newState] / np.sum(trans[:, :, newState])
            #     # the weights above are given by action and then state, so we should transpose the matrix:
            #     weights = np.transpose(weights)
            #     Q += np.exp(-lrDecay*i)*lr*reward*weights

            # Update total reward:
            totalReward += (reward - stepCost)

            # Update state:
            state = newState

            # Stop if we are done:
            if done:
                break

        # Then we reduce the exploration rate:
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i)

        if (i+1) % 1000 == 0:
            # Compute the mean reward over the last 100 episodes:
            mvals = np.mean(rList[i-100:i], axis=0)
            logDEBUG("Episode %d: mean reward: %f, mean num steps: %f" % (i+1, mvals[0], mvals[1]))

        # And we assign our data for visualization:
        rList[i] = [totalReward, step]

    logDEBUG("Done with the training.")

    # return the data array:
    return rList, Q
</sxh>
+ | |||
+ | * => Unfortunately, | ||
+ | |||
+ | {{ projects: | ||
+ | |||
+ | ===== What to learn from failed episodes ===== | ||
+ | |||
+ | * One thing we could also take out of this experiment is that, in this environment, | ||
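
One simple, hypothetical way to create such a signal (not something tested in this post) is to convert a silent failure into a small negative reward inside the episode loop:

<sxh python>
newState, reward, done, _ = env.step(action)
if done and reward == 0.0:
    # The episode ended without reaching the goal (ie. we fell into a hole):
    # turn this silent failure into an explicit penalty.
    reward = -1.0
</sxh>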
+ | |||
+ | * But well, I feel I spent enough time on this "Q table" learning for now, and I want to move to a neural network implementation now, so I'll just stop here for now and see later if there is a need for me to come back to it. | ||
+ | |||