Chapter 1 - 9: Solve OpenAI Gym’s Taxi-v2 Task
Copy each of the three code blocks below into a Python file named after its subheading, place the three files in the same folder, and then run

```bash
python main.py
```

to start training.
agent.py
```python
import numpy as np
from collections import defaultdict


class Agent:

    def __init__(self, nA=6):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        - Q: state-action values matrix
        - epsilon: exploration rate of the epsilon-greedy strategy
        """
        self.nA = nA
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        self.epsilon = 1  # epsilon = 1 is equivalent to a uniformly random policy

    def update_Q(self, Qsa, Qsa_next, reward, alpha, gamma):
        """ Update the action-value function estimate using the most recent time step. """
        return Qsa + (alpha * (reward + (gamma * Qsa_next) - Qsa))

    def epsilon_greedy_probs(self, Q_s, i_episode, eps=None):
        """ Obtain the action probabilities corresponding to an epsilon-greedy policy. """
        epsilon = 1.0 / i_episode
        if eps is not None:
            epsilon = eps
        policy_s = np.ones(self.nA) * epsilon / self.nA
        policy_s[np.argmax(Q_s)] = 1 - epsilon + (epsilon / self.nA)
        return policy_s

    def select_action(self, state, i_episode, eps=None):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment
        - i_episode: number of current episode

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        # get the epsilon-greedy action probabilities
        policy_s = self.epsilon_greedy_probs(self.Q[state], i_episode, eps)
        return np.random.choice(np.arange(self.nA), p=policy_s)

    def step(self, state, action, reward, next_state, done, i_episode, alpha=0.01, gamma=1):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        # Alternative (Q-learning / Sarsamax) update, kept for reference:
        # self.Q[state][action] += alpha * (reward + (gamma * np.max(self.Q[next_state])) - self.Q[state][action])

        # Expected Sarsa update: bootstrap on the expected value of Q[next_state]
        # under the current epsilon-greedy policy.
        Qsa = self.Q[state][action]
        policy_s = self.epsilon_greedy_probs(self.Q[next_state], i_episode)
        Qsa_next = np.dot(self.Q[next_state], policy_s)
        self.Q[state][action] = self.update_Q(Qsa, Qsa_next, reward, alpha, gamma)
```
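The `step` method above implements the Expected Sarsa update: rather than bootstrapping on the maximum action value (as Q-learning does), it bootstraps on the expectation of the next-state action values under the epsilon-greedy policy returned by `epsilon_greedy_probs`:

$$
Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \Big[ R_{t+1} + \gamma \sum_{a} \pi(a \mid S_{t+1})\, Q(S_{t+1}, a) - Q(S_t, A_t) \Big]
$$

where `alpha` is the step size $\alpha$ and `gamma` is the discount factor $\gamma$; the sum is computed in `step` by `np.dot(self.Q[next_state], policy_s)`.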
main.py
```python
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v3')
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent)
```
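If you want to see the learning curve rather than only the console printout, one option is to plot the `avg_rewards` deque that `interact` returns. The following is a minimal sketch that could be appended to `main.py`; it assumes matplotlib is installed and is not part of the original three files:

```python
# Optional: visualize the 100-episode moving average returned by interact().
# Assumes matplotlib is available; training itself does not need it.
import numpy as np
import matplotlib.pyplot as plt

episodes = np.arange(100, 100 + len(avg_rewards))  # averaging starts at episode 100
plt.plot(episodes, list(avg_rewards))
plt.xlabel('Episode')
plt.ylabel('Average reward over last 100 episodes')
plt.title('Best 100-episode average reward: {:.2f}'.format(best_avg_reward))
plt.show()
```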
monitor.py
```python
from collections import deque
import sys
import math
import numpy as np


def interact(env, agent, num_episodes=1000000, window=100):
    """ Monitor agent's performance.

    Params
    ======
    - env: instance of OpenAI Gym's Taxi-v3 environment
    - agent: instance of class Agent (see agent.py for details)
    - num_episodes: number of episodes of agent-environment interaction
    - window: number of episodes to consider when calculating average rewards

    Returns
    =======
    - avg_rewards: deque containing average rewards
    - best_avg_reward: largest value in the avg_rewards deque
    """
    # initialize average rewards
    avg_rewards = deque(maxlen=num_episodes)
    # initialize best average reward
    best_avg_reward = -math.inf
    # initialize monitor for most recent rewards
    samp_rewards = deque(maxlen=window)
    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # begin the episode
        state = env.reset()
        # initialize the sampled reward
        samp_reward = 0
        while True:
            # agent selects an action
            action = agent.select_action(state, i_episode, eps=0.005)
            # agent performs the selected action
            next_state, reward, done, _ = env.step(action)
            # agent performs internal updates based on sampled experience
            agent.step(state, action, reward, next_state, done, i_episode)
            # update the sampled reward
            samp_reward += reward
            # update the state (s <- s') to next time step
            state = next_state
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                break
        if i_episode >= 100:
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes:
            print('\n')
    return avg_rewards, best_avg_reward
```
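Once training finishes, you can sanity-check the learned Q-table by rolling out one purely greedy episode (no exploration). This is a minimal sketch, assuming the classic Gym API used above (4-tuple `env.step`, text-mode `env.render`) and the `env` and `agent` objects from `main.py` after `interact` has run; it is not part of the original lab code:

```python
# Greedy rollout with the learned Q-table (no exploration).
# Assumes `env` and `agent` already exist, e.g. appended to main.py after interact().
import numpy as np

state = env.reset()
total_reward, done = 0, False
while not done:
    action = int(np.argmax(agent.Q[state]))  # pick the greedy action for this state
    state, reward, done, _ = env.step(action)
    total_reward += reward
    env.render()  # prints the taxi grid to the console
print('Greedy episode return:', total_reward)
```

Because Taxi-v3 is wrapped in a 200-step time limit, the loop terminates even if the greedy policy is still poor early in training.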