import argparse

import numpy as np

from environment import MountainCar, GridWorld
- """
- Please read: THE ENVIRONMENT INTERFACE
- In this homework, we provide the environment (either MountainCar or GridWorld)
- to you. The environment returns states, represented as 1D numpy arrays, rewards,
- and a Boolean flag indicating whether the episode has terminated. The environment
- accepts actions, represented as integers.
- The only file you need to modify/read is this one. We describe the environment
- interface below.
- class Environment: # either MountainCar or GridWorld
- def __init__(self, mode, debug=False):
- Initialize the environment with the mode, which can be either "raw"
- (for the raw state representation) or "tile" (for the tiled state
- representation). The raw state representation contains the position and
- velocity; the tile representation contains zeroes for the non-active
- tile indices and ones for the active indices. GridWorld must be used in
- tile mode. The debug flag will log additional information for you;
- make sure that this is turned off when you submit to the autograder.
- self.state_space = an integer representing the size of the state vector
- self.action_space = an integer representing the range for the valid actions
- You should make use of env.state_space and env.action_space when creating
- your weight matrix.
- def reset(self):
- Resets the environment to initial conditions. Returns:
- (1) state : A numpy array of size self.state_space, representing
- the initial state.
- def step(self, action):
- Updates itself based on the action taken. The action parameter is an
- integer in the range [0, 1, ..., self.action_space). Returns:
- (1) state : A numpy array of size self.state_space, representing
- the new state that the agent is in after taking its
- specified action.
- (2) reward : A float indicating the reward received at this step.
- (3) done : A Boolean flag indicating whether the episode has
- terminated; if this is True, you should reset the
- environment and move on to the next episode.
- def render(self, mode="human"):
- Renders the environment at the current step. Only supported for MountainCar.
- For example, for the GridWorld environment, you could do:
- env = GridWorld(mode="tile")
- Then, you can initialize your weight matrix to all zeroes with shape
- (env.action_space, env.state_space+1) (if you choose to fold the bias term in).
- Note that the states returned by the environment do *not* have the bias term
- folded in.
- """
def parse_args() -> tuple:
    """
    Parses all args and returns them. Returns:
        (1) env_type : A string, either "mc" or "gw", indicating the type of
                       environment you should use
        (2) mode : A string, either "raw" or "tile"
        (3) weight_out : The output path of the file containing your weights
        (4) returns_out : The output path of the file containing your returns
        (5) episodes : An integer indicating the number of episodes to train for
        (6) max_iterations : An integer representing the max number of iterations
                             your agent should run in each episode
        (7) epsilon : A float representing the epsilon parameter for
                      epsilon-greedy action selection
        (8) gamma : A float representing the discount factor gamma
        (9) lr : A float representing the learning rate

    Usage:
        env_type, mode, weight_out, returns_out, episodes, max_iterations, epsilon, gamma, lr = parse_args()
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("env", type=str, choices=["mc", "gw"])
    parser.add_argument("mode", type=str, choices=["raw", "tile"])
    parser.add_argument("weight_out", type=str)
    parser.add_argument("returns_out", type=str)
    parser.add_argument("episodes", type=int)
    parser.add_argument("max_iterations", type=int)
    parser.add_argument("epsilon", type=float)
    parser.add_argument("gamma", type=float)
    parser.add_argument("learning_rate", type=float)
    args = parser.parse_args()

    return args.env, args.mode, args.weight_out, args.returns_out, args.episodes, args.max_iterations, args.epsilon, args.gamma, args.learning_rate
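# For example, the script might be invoked like this (the script name, output
# file names, and hyperparameter values below are placeholders):
#
#     python q_learning.py mc tile weights.out returns.out 25 200 0.05 0.99 0.01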
def select_action(weights, state, epsilon, action_space):
    """
    Epsilon-greedy action selection: with probability epsilon, picks a
    uniformly random action; otherwise picks the greedy action
    argmax_a q(state, a), where q(state, a) = weights[a] . state.
    """
    if np.random.uniform(0, 1) < epsilon:
        return np.random.randint(0, action_space)
    q_values = np.matmul(weights, state, dtype=np.float64)
    return np.argmax(q_values)
def select_action_next(weights, state):
    """
    Greedy action selection for the Q-learning bootstrap target: always
    returns argmax_a q(state, a), independent of epsilon.
    """
    q_values = np.matmul(weights, state, dtype=np.float64)
    return np.argmax(q_values)
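# For reference, the training loop below performs the standard Q-learning
# update for a linear approximator q(s, a) = w_a . s (bias folded into s):
#
#     w_a <- w_a - lr * (q(s, a) - (r + gamma * max_a' q(s', a'))) * s
#
# where a is the action actually taken and max_a' q(s', a') is computed via
# select_action_next above.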
if __name__ == "__main__":
    env_type, mode, weight_out, returns_out, episodes, max_iterations, epsilon, gamma, lr = parse_args()

    if env_type == "mc":
        env = MountainCar(mode=mode)
    elif env_type == "gw":
        env = GridWorld(mode=mode)
    else:
        raise Exception(f"Invalid environment type {env_type}")

    # One row of weights per action; the extra column holds the bias term.
    weights = np.zeros((env.action_space, env.state_space + 1), dtype=np.float64)
    all_rewards = []

    for episode in range(episodes):
        # Get the initial state by calling env.reset()
        cur_state = env.reset()
        # The bias term's corresponding state value should be 1, at position 0.
        cur_state = np.insert(cur_state, 0, 1)
        cur_reward = 0

        for iteration in range(max_iterations):
            # Select an action based on the state via the epsilon-greedy strategy
            action = select_action(weights, cur_state, epsilon, env.action_space)

            # Take a step in the environment with this action, and get the
            # returned next state, reward, and done flag
            next_state, reward, done = env.step(action)
            next_state = np.insert(next_state, 0, 1)

            # Using the original state, the action, the next state, and the
            # reward, update the parameters. The bias term is updated along
            # with the other weights because it is folded into the state.
            best_next_action = select_action_next(weights, next_state)
            td_error = np.dot(cur_state, weights[action]) \
                - (reward + gamma * np.dot(next_state, weights[best_next_action]))
            weights[action] = weights[action] - lr * td_error * cur_state

            cur_reward += reward
            # Break out of this inner loop if the environment signals done!
            if done:
                break
            cur_state = next_state

        all_rewards.append(cur_reward)

    # Save your weights and returns. The reference solution uses
    # np.savetxt(..., fmt="%.18e", delimiter=" ")
    np.savetxt(returns_out, all_rewards, delimiter=' ')
    np.savetxt(weight_out, weights, delimiter=' ')
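# The outputs are plain-text files, so they can be reloaded for inspection,
# e.g. (the paths below are placeholders):
#
#     weights = np.loadtxt("weights.out")  # shape (action_space, state_space + 1)
#     returns = np.loadtxt("returns.out")  # one total reward per episode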