As mentioned in my previous post on this subject, I feel it's worth investigating this “Q-table learning” algorithm a bit further. More specifically, I want to try to introduce some kind of probabilistic action management in the system, even if I'm not sure yet what kind of results this will give me. Let's just try and see!
So I spent some time trying to build a probabilistic Q-table learning mechanism and ended up with the following code:
import random
import gym
import numpy as np

# Note: logDEBUG is a simple logging helper.

def train_frozenlake_v2(numEpisodes, drate=7.0, stepCost=0.0, learningRate=0.8,
                        eps=1.0, max_epsilon=1.0, min_epsilon=0.001):
    logDEBUG("Building FrozenLake environment...")
    env = gym.make('FrozenLake-v0')

    nstates = env.observation_space.n
    nactions = env.action_space.n

    # We don't want to start with a zero-valued Q-table: we are going to
    # treat each row of the table as a probability distribution over the
    # actions, so each row should sum to 1:
    Q = np.ones([nstates, nactions]) / float(nactions)
    logDEBUG("Qtable shape: %s" % str(Q.shape))

    # Set learning parameters:
    lr = learningRate  # learning rate
    y = .95            # gamma (ie. discount rate)
    maxSteps = 99

    # Exploration parameters:
    epsilon = 1.0       # Exploration rate
    # decay_rate = drate/numEpisodes
    decay_rate = drate  # Exponential decay rate for the exploration prob
    # decay rate detail: with a decay_rate of 7/numEpisodes we reach a
    # prob of exp(-7)~0.0009 after numEpisodes episodes.

    actions = np.arange(nactions)

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    for i in range(numEpisodes):
        # Reset the environment and get the first observation:
        state = env.reset()
        totalReward = 0
        done = False
        step = 0

        path = np.zeros((maxSteps, 4))  # cols: state, action, prob, reward

        # The Q-table learning algorithm:
        while step < maxSteps:
            # Check if we should do exploration or exploitation:
            exp_thres = random.uniform(0, 1)
            action = None
            if exp_thres < epsilon:
                # We do exploration:
                action = env.action_space.sample()
            else:
                # We do exploitation, so we use our current Q-table, but we
                # sample from the action probabilities instead of taking an
                # ordinary argmax:
                action = np.random.choice(actions, p=Q[state, :])

            # Get the new state and reward from the environment:
            newState, reward, done, info = env.step(action)

            # We don't update the Q-table with the Bellman formula right away:
            # instead we keep track of the action we selected for that state,
            # and we will decide later if this was a good choice or not,
            # applying the reward only at that point:
            path[step] = [float(state), float(action), Q[state, action], reward - stepCost]

            # Update the total reward:
            totalReward += (reward - stepCost)

            # Update the state:
            state = newState
            step += 1

            # Stop if we are done:
            if done:
                break

        # When done with the episode we apply the training: we distribute
        # the total reward that we got, taking into account the probability
        # of the actions taken in the process.

        # Remove the non-relevant data:
        spath = path[:step]

        # "coop ratio": the lower the probability of a step was, the larger
        # the share of the reward it receives:
        weights = 1.0/(eps + spath[:, 2])
        # normalize the weights:
        weights /= np.sum(weights)

        # If the total reward is 0.0 we could apply a small penalty here:
        # if totalReward == 0.0:
        #     totalReward = -1.0

        # Now we train the probability for each previously performed step:
        Q[spath[:, 0].astype(int), spath[:, 1].astype(int)] += lr*totalReward*weights

        # Finally, we renormalize our probability distributions:
        Q = np.clip(Q, a_min=0.0, a_max=None)
        Q /= np.sum(Q, axis=1).reshape(-1, 1)

        # Then we reduce the exploration rate:
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i)

        if (i+1) % 1000 == 0:
            # Compute the mean reward over the last 100 episodes:
            mvals = np.mean(rList[i-100:i], axis=0)
            logDEBUG("%d/%d: Mean reward MA: %f" % (i+1, numEpisodes, mvals[0]))

        # And we store our data for visualization:
        rList[i] = [totalReward, step]

    logDEBUG("Final Qtable: %s" % np.around(Q, 6))

    # return the data arrays:
    return rList, Q
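To make that episode-end update easier to follow, here is a minimal standalone sketch of the same "probability training" step on a toy 2-state, 2-action table (all the numbers below are made up for illustration):

import numpy as np

# Toy Q-table: 2 states, 2 actions, each row is a probability distribution.
Q = np.array([[0.5, 0.5],
              [0.5, 0.5]])

lr = 0.8
eps = 1.0
totalReward = 1.0  # pretend the episode reached the goal

# Pretend the episode visited (state 0, action 1) then (state 1, action 0).
# Columns are: state, action, probability of the action, step reward.
spath = np.array([[0.0, 1.0, Q[0, 1], 0.0],
                  [1.0, 0.0, Q[1, 0], 1.0]])

# Steps that were taken with a low probability get a larger weight:
weights = 1.0/(eps + spath[:, 2])
weights /= np.sum(weights)

# Push the total reward onto the selected (state, action) entries:
Q[spath[:, 0].astype(int), spath[:, 1].astype(int)] += lr*totalReward*weights

# Renormalize so each row stays a valid probability distribution:
Q = np.clip(Q, a_min=0.0, a_max=None)
Q /= np.sum(Q, axis=1).reshape(-1, 1)

print(Q)  # -> [[0.357 0.643], [0.643 0.357]] approximately

The clip plus row renormalization at the end is what keeps each row usable as the p argument of np.random.choice on the next episode.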
And here is the final Q-table I get from this version (each of the 16 rows is the action probability distribution for one state of the 4x4 lake):

[[ 0.        0.        1.        0.      ]
 [ 0.        0.        0.        1.      ]
 [ 1.        0.        0.        0.      ]
 [ 0.007265  0.492735  0.492735  0.007265]
 [ 0.        0.        1.        0.      ]
 [ 0.25      0.25      0.25      0.25    ]
 [ 0.        0.        1.        0.      ]
 [ 0.25      0.25      0.25      0.25    ]
 [ 0.        1.        0.        0.      ]
 [ 0.        1.        0.        0.      ]
 [ 0.        1.        0.        0.      ]
 [ 0.25      0.25      0.25      0.25    ]
 [ 0.25      0.25      0.25      0.25    ]
 [ 0.        0.        0.        1.      ]
 [ 0.        1.        0.        0.      ]
 [ 0.25      0.25      0.25      0.25    ]]
And this is quite a bit better than what I was getting with the reference implementation below:
def train_frozenlake_v3(numEpisodes, drate=7.0, stepCost=0.0, learningRate=0.8,
                        ename="FrozenLake-v0", lrDecay=0.0001):
    logDEBUG("Building FrozenLake environment...")
    env = gym.make(ename)

    # Initialize the Q-table with all zeros:
    nstates = env.observation_space.n
    nactions = env.action_space.n
    Q = np.zeros([nstates, nactions])
    logDEBUG("Qtable shape: %s" % str(Q.shape))

    # Set learning parameters:
    lr = learningRate  # learning rate
    y = .95            # gamma (ie. discount rate)
    maxSteps = 99

    # Exploration parameters:
    epsilon = 1.0       # Exploration rate
    max_epsilon = 1.0   # Exploration probability at start
    min_epsilon = 0.01  # Minimum exploration probability
    decay_rate = drate  # Exponential decay rate for the exploration prob

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    # Here we will also use a transition tensor:
    trans = np.zeros((nactions, nstates, nstates))

    for i in range(numEpisodes):
        # Reset the environment and get the first observation:
        state = env.reset()
        totalReward = 0
        done = False
        step = 0

        # The Q-table learning algorithm:
        while step < maxSteps:
            step += 1

            # Check if we should do exploration or exploitation:
            exp_thres = random.uniform(0, 1)
            action = None
            if exp_thres < epsilon:
                # We do exploration:
                action = env.action_space.sample()
            else:
                # We do exploitation, so we use our current Q-table:
                action = np.argmax(Q[state, :])

            # Get the new state and reward from the environment:
            newState, reward, done, info = env.step(action)

            # We have done one step, so we update the transition tensor:
            trans[action, state, newState] += 1.0

            # Get the probabilistic next-state weights:
            weights = trans[action, state, :] / np.sum(trans[action, state, :])
            maxQs = np.max(Q, axis=1)  # current max Q value for each state

            # Compute the dot product of the weights and the max Q values:
            maxQ = np.dot(weights, maxQs)

            # Update the Q-table with new knowledge using the Bellman formula:
            # Q[state,action] = Q[state,action] + lr*(reward - stepCost + y*np.max(Q[newState,:]) - Q[state,action])
            # Here we replace np.max(Q[newState,:]) with the expected max Q
            # value maxQ, and we decay the learning rate over the episodes:
            Q[state, action] = Q[state, action] + np.exp(-lrDecay*i)*lr*(reward - stepCost + y*maxQ - Q[state, action])

            # Update the total reward:
            totalReward += (reward - stepCost)

            # Update the state:
            state = newState

            # Stop if we are done:
            if done:
                break

        # Then we reduce the exploration rate:
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i)

        if (i+1) % 1000 == 0:
            # Compute the mean reward over the last 100 episodes:
            mvals = np.mean(rList[i-100:i], axis=0)
            logDEBUG("%d/%d: Mean reward: %f" % (i+1, numEpisodes, mvals[0]))

        # And we store our data for visualization:
        rList[i] = [totalReward, step]

    logDEBUG("Final Qtable: %s" % Q)

    # return the data arrays:
    return rList, Q
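The key difference from the plain Bellman update is the y*maxQ term: instead of trusting only the single newState we just observed, it takes the expectation of the per-state max Q values under the empirical transition distribution recorded in trans. A small sketch of just that computation, with made-up counts:

import numpy as np

nactions, nstates = 2, 3

# Transition counts: trans[action, state, next_state]
trans = np.zeros((nactions, nstates, nstates))
trans[0, 0, 1] = 3.0  # action 0 in state 0 led to state 1 three times
trans[0, 0, 2] = 1.0  # ... and to state 2 once

Q = np.array([[0.0, 0.0],
              [0.4, 0.1],
              [0.9, 0.2]])

state, action = 0, 0

# Empirical probability of each next state for this (state, action) pair:
weights = trans[action, state, :] / np.sum(trans[action, state, :])
# -> [0.0, 0.75, 0.25]

# Current best achievable value from each state:
maxQs = np.max(Q, axis=1)
# -> [0.0, 0.4, 0.9]

# Expected max Q over the observed transition distribution:
maxQ = np.dot(weights, maxQs)
print(maxQ)  # 0.75*0.4 + 0.25*0.9 = 0.525

This effectively builds an empirical model of the environment dynamics as training goes, which should make the update less sensitive to a single unlucky transition on the slippery lake.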
def propagateQValue(Q, trans, state, reward, y):
    # Nothing to propagate if the reward is negligible:
    if abs(reward) <= 1e-6:
        return Q

    # Weights proportional to how often each (action, prev_state) pair
    # led to the current state:
    weights = trans[:, :, state] / (np.sum(trans[:, :, state]) + 1e-10)

    # The weights above are indexed by action and then state, so we
    # transpose the matrix to match the Q-table layout:
    weights = np.transpose(weights)
    Q += reward * weights

    # We sum the weights for each state and propagate the discounted
    # reward recursively to the predecessor states:
    weights = np.sum(weights, axis=1) * y * reward
    for i in range(weights.shape[0]):
        if i != state:
            Q = propagateQValue(Q, trans, i, weights[i], y)

    return Q

def train_frozenlake_v4(numEpisodes, drate=7.0, stepCost=0.0, learningRate=0.8,
                        ename="FrozenLake-v0", lrDecay=0.0001):
    logDEBUG("Building FrozenLake environment...")
    env = gym.make(ename)

    # Initialize the Q-table with all zeros:
    nstates = env.observation_space.n
    nactions = env.action_space.n
    Q = np.zeros([nstates, nactions])
    logDEBUG("Qtable shape: %s" % str(Q.shape))

    # Set learning parameters:
    lr = learningRate  # learning rate
    y = .95            # gamma (ie. discount rate)
    maxSteps = 99

    # Exploration parameters:
    epsilon = 1.0       # Exploration rate
    max_epsilon = 1.0   # Exploration probability at start
    min_epsilon = 0.01  # Minimum exploration probability
    decay_rate = drate  # Exponential decay rate for the exploration prob

    # Array containing the total reward and number of steps per episode:
    rList = np.zeros((numEpisodes, 2))

    # Here we will also use a transition tensor:
    trans = np.zeros((nactions, nstates, nstates))

    for i in range(numEpisodes):
        # Reset the environment and get the first observation:
        state = env.reset()
        totalReward = 0
        done = False
        step = 0

        # The Q-table learning algorithm:
        while step < maxSteps:
            step += 1

            # Check if we should do exploration or exploitation:
            exp_thres = random.uniform(0, 1)
            action = None
            if exp_thres < epsilon:
                # We do exploration:
                action = env.action_space.sample()
            else:
                # We do exploitation, so we use our current Q-table:
                action = np.argmax(Q[state, :])

            # Get the new state and reward from the environment:
            newState, reward, done, info = env.step(action)

            # We have done one step, so we update the transition tensor:
            trans[action, state, newState] += 1.0

            # If there is a reward to distribute (either positive or negative),
            # we should consider how we reached this current state, and we
            # should do so recursively:
            Q = propagateQValue(Q, trans, newState, np.exp(-lrDecay*i)*lr*reward, y)

            # Update the total reward:
            totalReward += (reward - stepCost)

            # Update the state:
            state = newState

            # Stop if we are done:
            if done:
                break

        # Then we reduce the exploration rate:
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i)

        if (i+1) % 1000 == 0:
            # Compute the mean reward over the last 100 episodes:
            mvals = np.mean(rList[i-100:i], axis=0)
            logDEBUG("%d/%d: Mean reward: %f" % (i+1, numEpisodes, mvals[0]))

        # And we store our data for visualization:
        rList[i] = [totalReward, step]

    logDEBUG("Final Qtable: %s" % Q)

    # return the data arrays:
    return rList, Q