Q-Learning Algorithms

What Is the Q-Learning Algorithm?
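
Q-learning is a model-free, off-policy reinforcement learning algorithm. It learns a table of action values Q(s, a), each estimating the return you can expect by taking action a in state s and acting greedily afterwards. After every transition (s, a, r, s') the table is updated with

Q(s, a) ← Q(s, a) + α [ r + γ · max_a' Q(s', a') − Q(s, a) ]

where α is the learning rate and γ is the discount factor. The max over the next state's actions is what makes the method off-policy: the target assumes the best next action, regardless of which action the behaviour policy (here, epsilon-greedy) actually takes. Each example below implements exactly this update.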

Q-Learning Algorithms in Action via Gymnasium Games
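
All three examples below are self-contained scripts. They only need Gymnasium and NumPy (nothing is rendered, so no extra display dependencies are required):

pip install gymnasium numpy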

Q-Learning for FrozenLake-v1

import gymnasium as gym
import numpy as np

env = gym.make("FrozenLake-v1", is_slippery=False)
n_states = env.observation_space.n
n_actions = env.action_space.n

Q = np.zeros((n_states, n_actions))  # Q-table: one row per state, one column per action
alpha = 0.1      # learning rate
gamma = 0.99     # discount factor
epsilon = 0.1    # exploration rate for the epsilon-greedy policy
episodes = 5000  # number of training episodes

def epsilon_greedy(state):
    # Explore with probability epsilon, otherwise exploit the best-known action
    if np.random.random() < epsilon:
        return env.action_space.sample()
    return np.argmax(Q[state])

for ep in range(episodes):
    state, _ = env.reset()
    done = False
    while not done:
        action = epsilon_greedy(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Q-Learning update
        Q[state, action] += alpha * (
            reward + gamma * np.max(Q[next_state]) - Q[state, action]
        )
        state = next_state

print("Trained Q-table:\n", Q)

Q-Learning for CliffWalking-v0 (Classic RL Example)

import gymnasium as gym
import numpy as np

env = gym.make("CliffWalking-v0")
n_states = env.observation_space.n
n_actions = env.action_space.n

Q = np.zeros((n_states, n_actions))
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # exploration rate
episodes = 1000

def epsilon_greedy(state):
    if np.random.random() < epsilon:
        return env.action_space.sample()
    return np.argmax(Q[state])

for ep in range(episodes):
    state, _ = env.reset()
    done = False
    while not done:
        action = epsilon_greedy(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        Q[state, action] += alpha * (
            reward + gamma * np.max(Q[next_state]) - Q[state, action]
        )
        state = next_state

print("Trained Q-table for CliffWalking:\n", Q)

Q-Learning for MountainCar-v0 (Discretized State Space)

import gymnasium as gym
import numpy as np

env = gym.make("MountainCar-v0")
n_actions = env.action_space.n

# Discretize the continuous state space into a 20 x 20 grid
# (MountainCar-v0 positions lie in [-1.2, 0.6] and velocities in [-0.07, 0.07])
pos_bins = np.linspace(-1.2, 0.6, 20)
vel_bins = np.linspace(-0.07, 0.07, 20)

def discretize(state):
    pos, vel = state
    pos_idx = np.digitize(pos, pos_bins) - 1
    vel_idx = np.digitize(vel, vel_bins) - 1
    return pos_idx, vel_idx

Q = np.zeros((20, 20, n_actions))  # Q-table indexed by (position bin, velocity bin, action)
alpha = 0.1
gamma = 0.99
epsilon = 0.1
episodes = 10000

def epsilon_greedy(state):
    if np.random.random() < epsilon:
        return np.random.randint(0, n_actions)
    return np.argmax(Q[state])

for ep in range(episodes):
    obs, _ = env.reset()
    state = discretize(obs)
    done = False

    while not done:
        action = epsilon_greedy(state)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state = discretize(next_obs)

        # Q-Learning update; do not bootstrap from a terminal next state
        target = reward if terminated else reward + gamma * np.max(Q[next_state])
        Q[state][action] += alpha * (target - Q[state][action])

        state = next_state

print("Trained Q-table shape:", Q.shape)