Chapter 1 - 9: Solve OpenAI Gym’s Taxi-v2 Task
Copy each of the three code blocks below into a Python file named after its subheading, place the three files in the same folder, and then run

```bash
python main.py
```

to start training.
agent.py
```python
import numpy as np
from collections import defaultdict


class Agent:

    def __init__(self, nA=6):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        - Q: state-action values matrix
        - epsilon: exploration rate of the epsilon-greedy strategy
        """
        self.nA = nA
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        self.epsilon = 1  # epsilon = 1 is equivalent to a uniformly random policy

    def update_Q(self, Qsa, Qsa_next, reward, alpha, gamma):
        """ Update the action-value function estimate using the most recent time step. """
        return Qsa + (alpha * (reward + (gamma * Qsa_next) - Qsa))

    def epsilon_greedy_probs(self, Q_s, i_episode, eps=None):
        """ Obtain the action probabilities corresponding to an epsilon-greedy policy. """
        epsilon = 1.0 / i_episode
        if eps is not None:
            epsilon = eps
        policy_s = np.ones(self.nA) * epsilon / self.nA
        policy_s[np.argmax(Q_s)] = 1 - epsilon + (epsilon / self.nA)
        return policy_s

    def select_action(self, state, i_episode, eps=None):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment
        - i_episode: number of current episode

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        # get the epsilon-greedy action probabilities
        policy_s = self.epsilon_greedy_probs(self.Q[state], i_episode, eps)
        return np.random.choice(np.arange(self.nA), p=policy_s)

    def step(self, state, action, reward, next_state, done, i_episode, alpha=0.01, gamma=1):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        # Alternative (Q-learning / Sarsamax) update, kept for reference:
        # self.Q[state][action] += alpha * (reward + (gamma * np.max(self.Q[next_state])) - self.Q[state][action])

        # Expected Sarsa update: bootstrap on the expected value of Q[next_state]
        # under the current epsilon-greedy policy.
        Qsa = self.Q[state][action]
        policy_s = self.epsilon_greedy_probs(self.Q[next_state], i_episode)
        Qsa_next = np.dot(self.Q[next_state], policy_s)
        self.Q[state][action] = self.update_Q(Qsa, Qsa_next, reward, alpha, gamma)
```
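The `step` method above implements the Expected Sarsa update: rather than bootstrapping on the maximum action value (as Q-learning does), it bootstraps on the expectation of the next-state action values under the epsilon-greedy policy returned by `epsilon_greedy_probs`:

$$
Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \Big[ R_{t+1} + \gamma \sum_{a} \pi(a \mid S_{t+1})\, Q(S_{t+1}, a) - Q(S_t, A_t) \Big]
$$

where `alpha` is the step size $\alpha$ and `gamma` is the discount factor $\gamma$; the sum is computed in `step` by `np.dot(self.Q[next_state], policy_s)`.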
main.py
```python
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v3')
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent)
```
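If you want to see the learning curve rather than only the console printout, one option is to plot the `avg_rewards` deque that `interact` returns. The following is a minimal sketch that could be appended to `main.py`; it assumes matplotlib is installed and is not part of the original three files:

```python
# Optional: visualize the 100-episode moving average returned by interact().
# Assumes matplotlib is available; training itself does not need it.
import numpy as np
import matplotlib.pyplot as plt

episodes = np.arange(100, 100 + len(avg_rewards))  # averaging starts at episode 100
plt.plot(episodes, list(avg_rewards))
plt.xlabel('Episode')
plt.ylabel('Average reward over last 100 episodes')
plt.title('Best 100-episode average reward: {:.2f}'.format(best_avg_reward))
plt.show()
```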
monitor.py
```python
from collections import deque
import sys
import math
import numpy as np


def interact(env, agent, num_episodes=1000000, window=100):
    """ Monitor agent's performance.

    Params
    ======
    - env: instance of OpenAI Gym's Taxi-v3 environment
    - agent: instance of class Agent (see agent.py for details)
    - num_episodes: number of episodes of agent-environment interaction
    - window: number of episodes to consider when calculating average rewards

    Returns
    =======
    - avg_rewards: deque containing average rewards
    - best_avg_reward: largest value in the avg_rewards deque
    """
    # initialize average rewards
    avg_rewards = deque(maxlen=num_episodes)
    # initialize best average reward
    best_avg_reward = -math.inf
    # initialize monitor for most recent rewards
    samp_rewards = deque(maxlen=window)
    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # begin the episode
        state = env.reset()
        # initialize the sampled reward
        samp_reward = 0
        while True:
            # agent selects an action
            action = agent.select_action(state, i_episode, eps=0.005)
            # agent performs the selected action
            next_state, reward, done, _ = env.step(action)
            # agent performs internal updates based on sampled experience
            agent.step(state, action, reward, next_state, done, i_episode)
            # update the sampled reward
            samp_reward += reward
            # update the state (s <- s') to next time step
            state = next_state
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                break
        if i_episode >= 100:
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes:
            print('\n')
    return avg_rewards, best_avg_reward
```
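Once training finishes, you can sanity-check the learned Q-table by rolling out one purely greedy episode (no exploration). This is a minimal sketch, assuming the classic Gym API used above (4-tuple `env.step`, text-mode `env.render`) and the `env` and `agent` objects from `main.py` after `interact` has run; it is not part of the original lab code:

```python
# Greedy rollout with the learned Q-table (no exploration).
# Assumes `env` and `agent` already exist, e.g. appended to main.py after interact().
import numpy as np

state = env.reset()
total_reward, done = 0, False
while not done:
    action = int(np.argmax(agent.Q[state]))  # pick the greedy action for this state
    state, reward, done, _ = env.step(action)
    total_reward += reward
    env.render()  # prints the taxi grid to the console
print('Greedy episode return:', total_reward)
```

Because Taxi-v3 is wrapped in a 200-step time limit, the loop terminates even if the greedy policy is still poor early in training.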