Source code for gym_loop.loops.default_loop

from tensorboardX import SummaryWriter
import logging
from gym import wrappers
import numpy as np

from .base_loop import BaseLoop

logging.basicConfig(level=logging.INFO)


[docs]class DefaultLoop(BaseLoop):
[docs] @staticmethod def get_default_parameters(): """Get default parameter dictionary for the loop""" return { "random_episodes": 0, "max_episode_len": 5000, "record": False, "record_dir": "./episode_records", "logdir": "./logs", "render": False, "max_episodes": 5, "episodes_per_checkpoint": 10, "time_per_checkpoint": None, "eval_logdir": "./eval_logs", "eval_episodes": 10, "eval_render": False, "eval_record": False, "eval_record_dir": "./eval_records", }
def __init__(self, agent, env, **params): super().__init__(**params) self.writer = SummaryWriter(logdir=self.logdir) self.eval_writer = SummaryWriter(logdir=self.logdir) self.env = env self.agent = agent self.global_step = 0 def __del__(self): self.writer.close()
[docs] def train(self): """Training loop""" # random loop logging.info("Running random episodes {} times".format(self.random_episodes)) for i in range(self.random_episodes): ob = self.env.reset() for _ in range(self.max_episode_len): ob, reward, done = self._step_random(ob, episode_num=i) if done: break if self.record: logging.info( f"Using Gym monitor to save videos, render self.environment flag {self.render}" ) self.env = wrappers.Monitor(self.env, directory=self.record_dir, force=True) # policy loop global_step = 0 for i in range(self.random_episodes, self.max_episodes): ob = self.env.reset() reward_per_ep = 0 for ep_step in range(self.max_episode_len): if self.render: self.env.render() global_step += 1 ob, reward, done = self._step_policy(ob) reward_per_ep += reward if done: break metrics = self.agent.metrics(i) if metrics is not None: for name, value in metrics.items(): if np.isnan(value): logging.warn("{} has nan".format(name)) self.writer.add_scalar(name, value, global_step=i) self.writer.add_scalar("reward", reward_per_ep, global_step=i) self.writer.add_scalar("avg_legth", ep_step, global_step=i) if i % self.episodes_per_checkpoint == 0 and i != 0: self.agent.save("checkpoint_{}".format(i))
[docs] def evaluate(self): if self.eval_record: logging.info( f"Using Gym monitor to save videos, render self.environment flag {self.eval_render}" ) self.env = wrappers.Monitor( self.env, directory=self.eval_record_dir, force=True ) for i in range(self.eval_episodes): ob = self.env.reset() reward_per_ep = 0 for ep_step in range(self.max_episode_len): if self.render: self.env.render() ob, reward, done = self._step_policy(ob, update=False) reward_per_ep += reward if done: break metrics = self.agent.metrics(i) if metrics is not None: for name, value in metrics.items(): self.eval_writer.add_scalar(name, value, global_step=i) self.eval_writer.add_scalar("reward", reward_per_ep, global_step=i) self.eval_writer.add_scalar("avg_legth", ep_step, global_step=i)
def _step_random(self, last_ob): action = self.env.action_space.sample() ob, reward, done, _ = self.env.step(action) self.agent.memorize(last_ob, action, reward, done, ob) return ob, reward, done def _step_policy(self, last_ob, update=True): self.global_step += 1 state = last_ob action = self.agent.act(state, self.global_step) ob, reward, done, _ = self.env.step(action) if update: self.agent.memorize(last_ob, action, reward, done, ob, self.global_step) self.agent.update(self.global_step) return ob, reward, done