====== Simple QTable learning ======

{{tag>

Today, I feel like trying an implementation of a "Q table learning" algorithm, using a simple OpenAI gym environment as a test bed.

====== ======

===== References =====

As usual, here are the main references I'm using for this experiment:

  * [[https://
  * [[https://

===== Prerequisites =====

  * First things first, I need the OpenAI gym python module, so let's install that (a quick sanity check follows just below): <code>pip install gym</code>
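
To quickly confirm that the module is usable, something like this should be enough (it simply builds the classic ''FrozenLake-v0'' environment that I'm going to use below anyway): <sxh python>
import gym

# Build the FrozenLake environment and display its dimensions:
env = gym.make('FrozenLake-v0')
print("Number of states: %d" % env.observation_space.n)   # 16 for the default 4x4 map
print("Number of actions: %d" % env.action_space.n)       # 4 (left, down, right, up)</sxh>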

===== FrozenLake application =====

  * So I first tried with this simple implementation: <sxh python>
import gym
import numpy as np
import random

from nv.core.utils import *

def train_frozenlake(numEpisodes):
    logDEBUG("Building FrozenLake environment...")
    env = gym.make('FrozenLake-v0')

    # Initialize table with all zeros
    nstates = env.observation_space.n
    nactions = env.action_space.n
    Q = np.zeros([nstates, nactions])
    logDEBUG("Q table shape: %s" % str(Q.shape))

    # Set learning parameters
    lr = .8   # learning rate
    y = .95   # gamma (ie. discount rate)

    # numEpisodes = 2000
    maxSteps = 99

    # Exploration parameters
    epsilon = 1.0       # Exploration rate
    max_epsilon = 1.0   # Exploration probability at start
    min_epsilon = 0.01  # Minimum exploration probability
    decay_rate = 7.0/numEpisodes  # Exponential decay rate for the exploration probability
    # decay rate detail: after 1800 episodes we reach a prob of exp(-7)~0.0009

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    for i in range(numEpisodes):
        # Reset environment and get first new observation
        state = env.reset()

        totalReward = 0
        done = False
        step = 0

        # logDEBUG("Starting episode %d..." % i)

        # The Q-Table learning algorithm
        while step < maxSteps:
            step += 1

            # Check if we should do exploration or exploitation:
            exp_thres = random.uniform(0, 1)

            action = None
            if exp_thres < epsilon:
                # We do exploration: take a random action
                action = env.action_space.sample()
            else:
                # We do exploitation: take the best known action for this state
                action = np.argmax(Q[state, :])

            # Get new state and reward from environment
            newState, reward, done, info = env.step(action)

            # Update Q-Table with new knowledge using Bellman formula:
            Q[state, action] = Q[state, action] + lr*(reward + y*np.max(Q[newState, :]) - Q[state, action])

            # Update total reward:
            totalReward += reward

            # Update state:
            state = newState

            # Stop if we are done:
            if done == True:
                break

        # Then we reduce the exploration rate:
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i)

        # And we assign our data for visualization:
        logDEBUG("Episode %d: reward=%f, steps=%d" % (i, totalReward, step))
        rList[i] = [totalReward, step]

    # Compute the mean reward (and mean number of steps):
    mvals = np.mean(rList, axis=0)
    logDEBUG("Mean reward / steps: %s" % str(mvals))

    # return the data array:
    return rList</sxh>
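
The key element in the function above is the single line performing the Q value update with the Bellman formula. Just to make that update more explicit, here it is isolated in a tiny helper together with a toy example (this is only a restatement of the update line used above, keeping the same ''lr'' and ''y'' parameters): <sxh python>
import numpy as np

def q_update(Q, state, action, reward, newState, lr=0.8, y=0.95):
    # Q-learning (Bellman) update: move Q[state, action] towards the observed
    # reward plus the discounted best value achievable from the next state.
    target = reward + y * np.max(Q[newState, :])
    Q[state, action] = Q[state, action] + lr * (target - Q[state, action])
    return Q

# Tiny worked example on a 2-states / 2-actions table:
Q = np.zeros([2, 2])
Q = q_update(Q, state=0, action=1, reward=1.0, newState=1)
print(Q)  # Q[0, 1] is now 0.8*1.0 = 0.8, all the other entries are still 0.0</sxh>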

  * And running this in jupyter with: <sxh python>
from nv.deep_learning.DQN_apps import train_frozenlake
import matplotlib.pyplot as plt
import pandas as pd

import os
# Note: the exact environment variable used here was not preserved, so this is an assumption:
os.environ['NV_DATA_DIR'] = "."

dataDir = os.environ['NV_DATA_DIR'] + "/"
print("Using data dir: %s" % dataDir)

exp = "frozenlake_qtable"  # experiment name used for the output file (assumed value)
res = train_frozenlake(1000)

print(res)

plt.figure(figsize = (18,9))
plt.plot(range(res.shape[0]), res[:,0], label="Total reward")
plt.plot(range(res.shape[0]), res[:,1], label="Number of steps")
plt.xlabel('Episodes', fontsize=18)
plt.ylabel('Value', fontsize=18)
plt.legend(fontsize=18)
# plt.grid(range(svals.shape[0]))
# for xc in range(0, res.shape[0], 100):
#     plt.axvline(x=xc)
filename = dataDir + exp + ".png"
plt.savefig(filename)
plt.show()</sxh>

  * And I got this kind of display:

{{ projects:

  * So... the only "total rewards" values we get here are 0.0 and 1.0, since FrozenLake only provides a reward of 1 when the agent actually reaches the goal. So to really see a trend we need some kind of averaging (see the small smoothing sketch just below).
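
Since the per episode reward is binary, a simple rolling mean makes the learning trend much easier to read. Here is a small sketch built on top of the ''res'' array returned by ''train_frozenlake'' above (the window size of 100 episodes is an arbitrary choice): <sxh python>
import pandas as pd
import matplotlib.pyplot as plt

# res[:,0] holds the total reward of each episode (0.0 or 1.0 here):
rewards = pd.Series(res[:, 0])

# Rolling mean over 100 episodes, which approximates the success rate over time:
smoothed = rewards.rolling(window=100, min_periods=1).mean()

plt.figure(figsize = (18,9))
plt.plot(smoothed, label="Mean reward (100 episodes window)")
plt.xlabel('Episodes', fontsize=18)
plt.ylabel('Success rate', fontsize=18)
plt.legend(fontsize=18)
plt.show()</sxh>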

  * So now we should just try to train for a longer period to see if we can really improve our results. And with 20000 episodes we get this kind of result:

{{ projects:

{{ projects:

  * => We can see that the reward is increasing progressively as the training proceeds.

  * With a more conventional decay setup (ie. a quick decay down to the 0.01 value) we get this kind of result (a small comparison of the two epsilon schedules is sketched just after the images):

{{ projects:

{{ projects:
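
To visualize what actually changes between those two setups, here is a small sketch comparing the slow epsilon decay used in the training function above with a quicker, more conventional schedule. The exact decay rate I used for the "quick" version is not reproduced in this post, so the 0.005 value below is only a hypothetical example: <sxh python>
import numpy as np
import matplotlib.pyplot as plt

numEpisodes = 20000
episodes = np.arange(numEpisodes)
min_epsilon, max_epsilon = 0.01, 1.0

# Slow decay, as in the training function above (only reaches ~exp(-7) at the very end):
slow = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-(7.0/numEpisodes)*episodes)

# Hypothetical "quick" decay: epsilon gets close to the 0.01 floor after about 1000 episodes:
quick = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-0.005*episodes)

plt.figure(figsize = (18,6))
plt.plot(episodes, slow, label="Slow decay (7.0/numEpisodes)")
plt.plot(episodes, quick, label="Quick decay (0.005, hypothetical)")
plt.xlabel('Episodes', fontsize=18)
plt.ylabel('epsilon', fontsize=18)
plt.legend(fontsize=18)
plt.show()</sxh>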

  * What I find strange with those results is that we don't seem to eventually reach a **perfect Qtable**. Yet, if you think about this kind of problem, it feels like this should be possible... So could it be that I have something going wrong here?

=> The typical Qtable array we get at the end of the training is a 16x4 array where most rows still contain several non-zero values.

And this is something I find strange in this case, because we could optimize this table much more: on each row we should ideally see only one main non-zero value, corresponding to the best action for that state.

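One way to check how good the table really is would be to evaluate the greedy policy it induces, without any exploration at all. This is not something I did in the training code above, so the snippet below is just a minimal evaluation sketch, assuming we keep a reference to the trained ''Q'' array (for instance by also returning it from ''train_frozenlake''): <sxh python>
import gym
import numpy as np

def evaluate_policy(Q, numEpisodes=1000, maxSteps=99):
    # Run episodes using only the greedy action from the Q-table (no exploration)
    # and return the fraction of episodes where the goal was actually reached.
    env = gym.make('FrozenLake-v0')
    successes = 0.0
    for _ in range(numEpisodes):
        state = env.reset()
        for _ in range(maxSteps):
            action = np.argmax(Q[state, :])
            state, reward, done, info = env.step(action)
            if done:
                successes += reward  # reward is 1.0 only when the goal is reached
                break
    return successes / numEpisodes

# Usage, assuming the trained Q table is available:
# print("Greedy policy success rate: %f" % evaluate_policy(Q))</sxh>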

===== Conclusion =====

=> I'm not really satisfied with the Q Table training using the Bellman formula, and even if that may sound crazy, I have the feeling we could do better. So this is what I will try in a following post to clarify the situation.