Today, I feel like trying an implementation of “Q-table learning”. Of course, the idea is to go much further than this eventually, but we have to start with the basics, right? So let's begin.
As usual, here are the main references I'm using for this experiment:
nv_py_call_pip install gym
import gym
import numpy as np
import random

from nv.core.utils import *

def train_frozenlake(numEpisodes):
    logDEBUG("Building FrozenLake environment...")
    env = gym.make('FrozenLake-v0')

    # Initialize the Q-table with all zeros:
    nstates = env.observation_space.n
    nactions = env.action_space.n
    Q = np.zeros([nstates, nactions])
    logDEBUG("Qtable shape: %s" % str(Q.shape))

    # Set learning parameters:
    lr = .8   # learning rate
    y = .95   # gamma (ie. discount rate)

    # numEpisodes = 2000
    maxSteps = 99

    # Exploration parameters:
    epsilon = 1.0            # Exploration rate
    max_epsilon = 1.0        # Exploration probability at start
    min_epsilon = 0.01       # Minimum exploration probability
    decay_rate = 7.0/1800.0  # Exponential decay rate for exploration prob
    # decay rate detail: after 1800 episodes we reach a prob of exp(-7)~0.0009

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    for i in range(numEpisodes):
        # Reset environment and get first new observation:
        state = env.reset()
        totalReward = 0
        done = False
        step = 0
        # logDEBUG("Performing episode %d/%d..." % (i, numEpisodes))

        # The Q-table learning algorithm:
        while step < maxSteps:
            step += 1

            # Check if we should do exploration or exploitation:
            exp_thres = random.uniform(0, 1)

            action = None
            if exp_thres < epsilon:
                # We do exploration:
                action = env.action_space.sample()
            else:
                # We do exploitation, so we use our current Q-table:
                action = np.argmax(Q[state, :])

            # Get new state and reward from environment:
            newState, reward, done, info = env.step(action)

            # Update Q-table with new knowledge using the Bellman formula:
            Q[state, action] = Q[state, action] + lr*(reward + y*np.max(Q[newState, :]) - Q[state, action])

            # Update total reward:
            totalReward += reward

            # Update state:
            state = newState

            # Stop if we are done:
            if done == True:
                break

        # Then we reduce the exploration rate:
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i)

        # And we assign our data for visualization:
        logDEBUG("%d/%d: Total reward: %f" % (i+1, numEpisodes, totalReward))
        rList[i] = [totalReward, step]

    # Compute the mean reward:
    mvals = np.mean(rList, axis=0)
    logDEBUG("Mean reward: %f" % mvals[0])

    # Return the data array:
    return rList
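For reference, the update performed on each step in the code above is the standard tabular Q-learning rule, where \(\alpha\) is the learning rate (lr = 0.8 in the code) and \(\gamma\) is the discount factor (y = 0.95):

\[ Q(s,a) \leftarrow Q(s,a) + \alpha \big[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \big] \]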
from nv.core.utils import *
from nv.deep_learning.DQN_apps import train_frozenlake

import matplotlib.pyplot as plt
import pandas as pd
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

dataDir = os.environ['NVSEED_DATA_DIR']+"/"
print("Data dir: ", dataDir)

exp = "1-first_trial"

res = train_frozenlake(1000)
print(res)

plt.figure(figsize=(18, 9))
plt.plot(range(res.shape[0]), res[:, 0], color='b', label='Reward')
plt.plot(range(res.shape[0]), res[:, 1], color='orange', label='Num steps')
plt.xlabel('Iteration')
plt.ylabel('Episode data')
plt.legend(fontsize=18)
# plt.grid(range(svals.shape[0]),axis='x', color='r', linestyle='-', linewidth=1)
# for xc in range(0,svals.shape[0]+1, lfreq):
#     plt.axvline(x=xc, color='k', linestyle='--', linewidth=1)

filename = dataDir+"deep_learning/tests/qtable/%s.png" % (exp)
plt.savefig(filename)
plt.show()
⇒ The typical Q-table we get at the end of the training looks something like this:
[[ 8.45356320e-02  3.21190753e-02  8.59557936e-02  8.57750436e-02]
 [ 1.38008125e-02  1.58740568e-03  1.44658989e-02  8.69052154e-02]
 [ 1.00856435e-02  2.02433213e-03  8.28338186e-03  7.49612561e-02]
 [ 2.57502785e-03  1.08999445e-02  4.77150803e-07  7.53541474e-02]
 [ 9.62490068e-02  4.54455221e-02  9.53828356e-03  5.80393039e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 1.89685598e-01  1.84186277e-04  2.81778415e-06  2.27085696e-08]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 4.37804809e-02  1.20205573e-01  8.06634119e-03  2.38534679e-01]
 [ 1.65409985e-02  7.52591077e-01  1.53091012e-03  1.23874446e-03]
 [ 2.63804747e-01  3.71528815e-03  3.09251232e-02  5.45746045e-03]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 2.64244622e-01  2.16863746e-03  5.37192468e-01  8.89917016e-02]
 [ 3.64068530e-01  4.66030522e-01  1.19812849e-01  4.60581823e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]
This is something I find strange in this case: it seems we could optimize this table much more, and on each row we should ideally see only one main non-zero value (the best action for that state).
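As a side note, here is a minimal sketch of how one could inspect this (it is not part of the scripts above, and it assumes you have access to the trained Q array, which train_frozenlake currently does not return): it extracts the greedy action per state and a rough measure of how dominant that action is on each row.

import numpy as np

def inspect_qtable(Q):
    # Greedy action (the would-be policy) for each state:
    policy = np.argmax(Q, axis=1)

    # How dominant the best action is on each row (best value / row sum),
    # guarding against the all-zero rows corresponding to holes/goal states:
    row_sums = np.sum(Q, axis=1)
    dominance = np.max(Q, axis=1) / np.maximum(row_sums, 1e-12)

    for s in range(Q.shape[0]):
        print("state %2d: best action=%d, dominance=%.2f" % (s, policy[s], dominance[s]))

    return policy

# Hypothetical usage, assuming the trained table Q is available:
# policy = inspect_qtable(Q)

A dominance close to 1.0 on every (non-terminal) row would indicate the "one main non-zero value per row" structure mentioned above.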
=> I'm not really satisfied with this Q-table training using the Bellman formula, and even if it may sound crazy, I have the feeling we could do better. So this is what I will try to clarify in a following post.