What Is the Q-Learning Algorithm?
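Q-learning is a model-free, off-policy reinforcement learning algorithm. In the tabular form used throughout this post, it maintains a table Q(s, a) that estimates the expected discounted return of taking action a in state s and acting greedily afterwards. After every transition (s, a, r, s'), the corresponding entry is nudged toward a bootstrapped target:

    Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]

where \alpha is the learning rate and \gamma is the discount factor. This is exactly the update applied inside each training loop below; an epsilon-greedy behaviour policy keeps the agent exploring while the estimates are still unreliable.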
The Q-Learning Algorithm in Action via Gymnasium Games
Q-Learning for FrozenLake-v1
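FrozenLake is a small grid world: the agent starts in one corner and must reach the goal tile without falling into a hole, receiving a reward of 1 only when it reaches the goal. With is_slippery=False the transitions are deterministic, which makes it the simplest testbed for tabular Q-learning.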
import gymnasium as gym
import numpy as np

env = gym.make("FrozenLake-v1", is_slippery=False)
n_states = env.observation_space.n
n_actions = env.action_space.n

# Q-table: one row per state, one column per action
Q = np.zeros((n_states, n_actions))

alpha = 0.1      # learning rate
gamma = 0.99     # discount factor
epsilon = 0.1    # exploration rate
episodes = 5000

def epsilon_greedy(state):
    # Explore with probability epsilon, otherwise exploit the current Q-table
    if np.random.random() < epsilon:
        return env.action_space.sample()
    return np.argmax(Q[state])

for ep in range(episodes):
    state, _ = env.reset()
    done = False
    while not done:
        action = epsilon_greedy(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Q-learning update: move Q(s, a) toward r + gamma * max_a' Q(s', a')
        Q[state, action] += alpha * (
            reward + gamma * np.max(Q[next_state]) - Q[state, action]
        )
        state = next_state

print("Trained Q-table:\n", Q)
CliffWalking-v0 (a Classic RL Example)
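CliffWalking is the classic example from Sutton and Barto: every step costs -1, and stepping off the cliff costs -100 and sends the agent back to the start, so the learned policy has to trade off path length against risk.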
import gymnasium as gym
import numpy as np

env = gym.make("CliffWalking-v0")
n_states = env.observation_space.n
n_actions = env.action_space.n

Q = np.zeros((n_states, n_actions))

alpha = 0.1      # learning rate
gamma = 0.9      # discount factor
epsilon = 0.1    # exploration rate
episodes = 1000

def epsilon_greedy(state):
    # Explore with probability epsilon, otherwise exploit the current Q-table
    if np.random.random() < epsilon:
        return env.action_space.sample()
    return np.argmax(Q[state])

for ep in range(episodes):
    state, _ = env.reset()
    done = False
    while not done:
        action = epsilon_greedy(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Same Q-learning update as in the FrozenLake example
        Q[state, action] += alpha * (
            reward + gamma * np.max(Q[next_state]) - Q[state, action]
        )
        state = next_state

print("Trained Q-table for CliffWalking:\n", Q)
MountainCar-v0 (Discretized State Space)
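MountainCar's observations are continuous (car position and velocity), so they cannot index a Q-table directly. The listing below buckets each dimension into 20 bins and learns a 20 x 20 x 3 table over the discretized grid.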
import gymnasium as gym
import numpy as np

env = gym.make("MountainCar-v0")
n_actions = env.action_space.n

# Discretize the continuous (position, velocity) observation into a 20x20 grid
pos_bins = np.linspace(-1.2, 0.6, 20)
vel_bins = np.linspace(-0.07, 0.07, 20)

def discretize(state):
    pos, vel = state
    pos_idx = np.digitize(pos, pos_bins) - 1
    vel_idx = np.digitize(vel, vel_bins) - 1
    return pos_idx, vel_idx

Q = np.zeros((20, 20, n_actions))

alpha = 0.1      # learning rate
gamma = 0.99     # discount factor
epsilon = 0.1    # exploration rate
episodes = 10000

def epsilon_greedy(state):
    # Explore with probability epsilon, otherwise exploit the current Q-table
    if np.random.random() < epsilon:
        return np.random.randint(0, n_actions)
    return np.argmax(Q[state])

for ep in range(episodes):
    obs, _ = env.reset()
    state = discretize(obs)
    done = False
    while not done:
        action = epsilon_greedy(state)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state = discretize(next_obs)
        # Q-learning update on the discretized (pos_idx, vel_idx) state
        Q[state][action] += alpha * (
            reward + gamma * np.max(Q[next_state]) - Q[state][action]
        )
        state = next_state

print("Trained Q-table shape:", Q.shape)