Monday, 25 January 2021

Simple DQN too slow to train

I have been trying to solve OpenAI Gym's Lunar Lander environment with a DQN taken from this paper:

https://arxiv.org/pdf/2006.04938v2.pdf

The issue is that it takes 12 hours to train 50 episodes, so something must be wrong.

import random
import gym
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model

ENV_NAME = "LunarLander-v2"

DISCOUNT_FACTOR = 0.9
LEARNING_RATE = 0.001

MEMORY_SIZE = 2000
TRAIN_START = 1000
BATCH_SIZE = 24

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.99

class MyModel(Model):
    def __init__(self, input_size, output_size):
        super(MyModel, self).__init__()
        self.d1 = Dense(128, input_shape=(input_size,), activation="relu")
        self.d2 = Dense(128, activation="relu")
        self.d3 = Dense(output_size, activation="linear")

    def call(self, x):
        x = self.d1(x)
        x = self.d2(x)
        return self.d3(x)

class DQNSolver():

    def __init__(self, observation_space, action_space):
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = MyModel(observation_space,action_space)
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        state_batch, q_values_batch = [], []
        for state, action, reward, state_next, terminal in batch:
            # q-value prediction for a given state
            q_values_cs = self.model.predict(state)
            # target q-value
            max_q_value_ns = np.amax(self.model.predict(state_next)[0])
            # correction on the Q value for the action used
            if terminal:
                q_values_cs[0][action] = reward
            else:
                q_values_cs[0][action] = reward + DISCOUNT_FACTOR * max_q_value_ns
            state_batch.append(state[0])
            q_values_batch.append(q_values_cs[0])
        # train the Q network
        self.model.fit(np.array(state_batch),
                        np.array(q_values_batch),
                        batch_size = BATCH_SIZE,
                        epochs = 1, verbose = 0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

def lunar_lander():
    env = gym.make(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    episode = 0
    scores = []  # collect scores across episodes so the rolling average below is meaningful
    print("Running")
    while True:
        episode += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        score = 0
        while True:
            action = dqn_solver.act(state)
            state_next, reward, terminal, _ = env.step(action)
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            dqn_solver.experience_replay()
            state = state_next
            score += reward
            if terminal:
                print("Episode: " + str(episode) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(score))
                scores.append(score)
                break
        # LunarLander-v2 counts as solved at an average score of 200 over the last 100 episodes
        if np.mean(scores[-min(100, len(scores)):]) >= 200:
            print("Problem is solved in {} episodes.".format(episode))
            break
    env.close()

if __name__ == "__main__":
    lunar_lander()

Here are the logs:

root@b11438e3d3e8:~# /usr/bin/python3 /root/test.py
2021-01-03 13:42:38.055593: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-01-03 13:42:39.338231: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2021-01-03 13:42:39.368192: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-01-03 13:42:39.368693: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1080 computeCapability: 6.1
coreClock: 1.8095GHz coreCount: 20 deviceMemorySize: 7.92GiB deviceMemoryBandwidth: 298.32GiB/s
2021-01-03 13:42:39.368729: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-01-03 13:42:39.370269: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-01-03 13:42:39.371430: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-01-03 13:42:39.371704: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10
2021-01-03 13:42:39.373318: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2021-01-03 13:42:39.374243: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10
2021-01-03 13:42:39.377939: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2021-01-03 13:42:39.378118: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-01-03 13:42:39.378702: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-01-03 13:42:39.379127: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2021-01-03 13:42:39.386525: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 3411185000 Hz
2021-01-03 13:42:39.386867: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x4fb44c0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-01-03 13:42:39.386891: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-01-03 13:42:39.498097: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-01-03 13:42:39.498786: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x4fdf030 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-01-03 13:42:39.498814: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): GeForce GTX 1080, Compute Capability 6.1
2021-01-03 13:42:39.498987: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-01-03 13:42:39.499416: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1080 computeCapability: 6.1
coreClock: 1.8095GHz coreCount: 20 deviceMemorySize: 7.92GiB deviceMemoryBandwidth: 298.32GiB/s
2021-01-03 13:42:39.499448: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-01-03 13:42:39.499483: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-01-03 13:42:39.499504: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-01-03 13:42:39.499523: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10
2021-01-03 13:42:39.499543: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10
2021-01-03 13:42:39.499562: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10
2021-01-03 13:42:39.499581: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7
2021-01-03 13:42:39.499643: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-01-03 13:42:39.500113: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-01-03 13:42:39.500730: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2021-01-03 13:42:39.500772: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-01-03 13:42:39.915228: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-01-03 13:42:39.915298: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263]      0 
2021-01-03 13:42:39.915322: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0:   N 
2021-01-03 13:42:39.915568: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-01-03 13:42:39.916104: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-01-03 13:42:39.916555: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6668 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1)
Running
2021-01-03 13:42:40.267699: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10

Here are the GPU stats from nvidia-smi:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 450.66       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  GeForce GTX 1080    Off  | 00000000:01:00.0  On |                  N/A |
|  0%   53C    P2    46W / 198W |   7718MiB /  8111MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

As you can see, TensorFlow reserves the GPU memory but the GPU utilization stays at 0%, so I am assuming the network inputs are too small for the GPU to be worthwhile and the computation falls back to the CPU.
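One quick way to test that claim, rather than inferring it from nvidia-smi, is to ask TensorFlow where it places ops. A minimal diagnostic sketch using standard TF 2.x calls (not part of the original script):

import tensorflow as tf

# List the GPUs TensorFlow can see; on this machine it should show the GTX 1080.
print(tf.config.list_physical_devices("GPU"))

# Log the device every op is placed on; lines ending in device:GPU:0
# confirm the work actually runs on the GPU.
tf.debugging.set_log_device_placement(True)

a = tf.random.normal((1024, 1024))
b = tf.random.normal((1024, 1024))
print(tf.matmul(a, b).device)  # expect ".../device:GPU:0"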

To make sure the GPU was set up properly, I ran a sample from the TensorFlow documentation, and it does use the GPU. You can also reproduce the problem on Google Colab here.

Is it an issue with the algorithm or the code?

Is there a way to utilize the GPU in this case?
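For reference, a likely bottleneck independent of device placement is that experience_replay above calls model.predict() once per sampled transition, i.e. 2 × BATCH_SIZE single-sample calls on every environment step, and each Keras predict() call carries a fixed overhead far larger than such a tiny forward pass. Below is a hedged sketch of how the method could batch those calls (my rewrite, not the paper's code; it drops into DQNSolver and assumes the same attributes as above):

def experience_replay(self):
    if len(self.memory) < BATCH_SIZE:
        return
    batch = random.sample(self.memory, BATCH_SIZE)
    # stack the (1, observation_space) rows into (BATCH_SIZE, observation_space) arrays
    states = np.vstack([s for s, _, _, _, _ in batch])
    next_states = np.vstack([ns for _, _, _, ns, _ in batch])
    # two predict() calls per replay instead of 2 * BATCH_SIZE
    q_values = self.model.predict(states)
    max_q_next = np.amax(self.model.predict(next_states), axis=1)
    for i, (_, action, reward, _, terminal) in enumerate(batch):
        q_values[i][action] = reward if terminal else reward + DISCOUNT_FACTOR * max_q_next[i]
    self.model.fit(states, q_values, batch_size=BATCH_SIZE, epochs=1, verbose=0)
    self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate * EXPLORATION_DECAY)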

Update

It turns out that the agent learns to fly instead of learning to land, so I added a cap of 150 steps per episode to limit episode length, but training is still very slow.
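For completeness, the cap looks roughly like this in the inner loop (my reconstruction; the post does not show the actual change, and the constant name MAX_STEPS is an assumption, only the value 150 comes from the post):

MAX_STEPS = 150  # assumed name; caps episode length so the agent cannot hover forever

step = 0
while True:
    step += 1
    action = dqn_solver.act(state)
    state_next, reward, terminal, _ = env.step(action)
    state_next = np.reshape(state_next, [1, observation_space])
    dqn_solver.remember(state, action, reward, state_next, terminal)
    dqn_solver.experience_replay()
    state = state_next
    score += reward
    if terminal or step >= MAX_STEPS:
        scores.append(score)
        break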


