In [4]:
import gym
import numpy
import tensorflow as tf
from collections import deque

# Smoke test: drive CartPole with random actions for 100 rendered steps.
env = gym.make('CartPole-v1')
try:
    env.reset()
    for _ in range(100):
        env.render()
        # This notebook unpacks env.step as (observation, reward, done, info),
        # so index 2 is the episode-finished flag.
        done = env.step(env.action_space.sample())[2]  # take a random action
        if done:
            # Stepping an episode that has already ended is not meaningful,
            # so begin a fresh one before continuing.
            env.reset()
finally:
    # Release the environment even if the loop is interrupted.
    env.close()

In [None]:
import gym
# Random-agent rollout on Breakout: 20 episodes, each capped at 100 steps.
env = gym.make('Breakout-v0')
try:
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            print(observation)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Close the environment even when the rollout is interrupted or raises.
    env.close()

**CartPole example**

**Initialization**

In [None]:
import gym
# Create the CartPole environment and show its action space.
env = gym.make('CartPole-v1')
print(env.action_space)

In [None]:
# Show the observation space of the environment created above.
print(env.observation_space)

In [None]:
# Show the `high` attribute of the observation space
# (presumably the per-dimension upper bounds of a Gym Box space).
print(env.observation_space.high)

In [None]:
# Show the `low` attribute of the observation space
# (presumably the per-dimension lower bounds of a Gym Box space).
print(env.observation_space.low)

Source code of the CartPole environment: https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

In [5]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from collections import deque
# Build the Q-network for CartPole: one output per discrete action.
env = gym.make("CartPole-v1")
# Read the sizes from the environment instead of hard-coding them, so the
# network always matches the spaces it is used with.
input_shape = list(env.observation_space.shape)  # [4] for CartPole-v1
n_outputs = int(env.action_space.n)  # 2 for CartPole-v1

model = keras.Sequential([
    keras.layers.Dense(32, activation="relu", input_shape=input_shape),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(n_outputs)  # no activation: raw Q-value estimates
])

2023-04-14 17:15:08.423530: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


**Definition of the epsilon-greedy policy**

In [None]:
def epsilon_greedy_policy(state, epsilon=0):
    """Choose an action with an epsilon-greedy rule over the global Q-network.

    Args:
        state: A single observation; a leading batch axis is added before it
            is passed to the global ``model``.
        epsilon: Threshold compared against a uniform random draw in [0, 1);
            when the draw is below it, a random action is returned. The
            default of 0 always takes the greedy action.

    Returns:
        The index of the selected action.
    """
    if np.random.rand() < epsilon:
        # Explore: sample over every network output instead of a literal 2,
        # so the policy stays in sync with the model's output size.
        return np.random.randint(n_outputs)
    # Exploit: verbose=0 stops Keras from printing a progress bar for each
    # single-state prediction.
    Q_values = model.predict(state[np.newaxis], verbose=0)
    return np.argmax(Q_values[0])

**Creation of the replay buffer**

In [None]:
# Bounded replay buffer: a deque with maxlen=2000 discards its oldest entry
# when a new one is appended to a full buffer.
replay_buffer = deque(maxlen=2000)

**Sampling experience**

In [None]:
def sample_experiences(batch_size):
    """Draw a random mini-batch of transitions from the global replay buffer.

    Indices come from ``np.random.randint``, so the same transition may be
    picked more than once within a batch.

    Args:
        batch_size: Number of transitions to draw.

    Returns:
        A tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
        arrays, one per field of the stored 5-field experience tuples.
    """
    picks = np.random.randint(len(replay_buffer), size=batch_size)
    batch = [replay_buffer[pick] for pick in picks]
    # Regroup the list of transitions into one array per field.
    columns = []
    for field in range(5):
        column = [transition[field] for transition in batch]
        columns.append(np.array(column))
    states, actions, rewards, next_states, dones = columns
    return states, actions, rewards, next_states, dones

**Hyperparameters, optimizer and loss**

In [None]:
# Hyperparameters plus the optimizer and loss used by the training step.
batch_size = 32
discount_factor = 0.95
# `learning_rate` is the supported keyword; the `lr` alias is deprecated.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = keras.losses.mean_squared_error

**Definition of the training step**

In [None]:
def training_step(batch_size):
    """Run one gradient update of the global Q-network on a sampled batch.

    Samples transitions via ``sample_experiences``, builds one-step
    Q-learning targets from the current network, and applies one optimizer
    step on the loss between those targets and the Q-values of the actions
    that were taken.

    Args:
        batch_size: Number of transitions to sample for this update.
    """
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences
    # verbose=0 avoids a progress bar on every training step.
    next_Q_values = model.predict(next_states, verbose=0)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    # (1 - dones) zeroes the bootstrap term for transitions that ended an episode.
    target_Q_values = (rewards + (1 - dones) * discount_factor * max_next_Q_values)
    # Q_values below has shape (batch, 1) because of keepdims=True. Make the
    # targets a column too; otherwise the element-wise loss would broadcast a
    # (batch,) target against (batch, 1) predictions into a (batch, batch) matrix.
    target_Q_values = target_Q_values.reshape(-1, 1)
    # One-hot mask keeps only the Q-value of the action actually taken.
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

**Experience collection and training loop**

In [None]:
def play_one_step(env, state, epsilon):
    """Take one epsilon-greedy step in ``env`` and record the transition.

    Args:
        env: Environment whose ``step`` returns a 4-tuple
            ``(next_state, reward, done, info)``.
        state: Observation the action is chosen from.
        epsilon: Exploration parameter forwarded to ``epsilon_greedy_policy``.

    Returns:
        The ``(next_state, reward, done, info)`` tuple produced by the step.
    """
    chosen = epsilon_greedy_policy(state, epsilon)
    new_state, gain, finished, extras = env.step(chosen)
    # Store the transition in the global replay buffer for later sampling.
    transition = (state, chosen, gain, new_state, finished)
    replay_buffer.append(transition)
    return new_state, gain, finished, extras

In [None]:
# Main loop: play 600 episodes of at most 200 steps each, storing every
# transition and training the network once enough episodes have been played.
for episode in range(600):
    obs = env.reset()
    # Epsilon depends only on the episode index, so compute it once per
    # episode. It decreases linearly from 1 and is floored at 0.01.
    epsilon = max(1 - episode / 500, 0.01)
    for step in range(200):
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        env.render()
        if done:
            break
        # Training starts only after episode 50; earlier episodes just add
        # transitions to the replay buffer.
        if episode > 50:
            training_step(batch_size)