merged commits

2024-09-25 07:57:26 -07:00
parent 88095e7899
commit 603b67ce66
12 changed files with 406 additions and 268 deletions
--- a/docker/environment.yaml
+++ b/docker/environment.yaml
@@ -1,56 +1,46 @@
-name: tdmpc2
+name: graph
 channels:
  - pytorch-nightly
  - nvidia
  - conda-forge
  - defaults
 dependencies:
-  - cudatoolkit=11.7
-  - glew=2.1.0
-  - glib=2.68.4
-  - pip=21.0
-  - python=3.9.0
-  - pytorch>=2.2.2
-  - torchvision>=0.16.2
+  - glew=2.2.0
+  - glib=2.78.4
+  - pip=24.0
+  - python=3.9
+  - pytorch
+  - pytorch-cuda=12.4
+  - torchvision
  - pip:
-    - absl-py==2.0.0
-    - "cython<3"
+    - absl-py==2.1.0
    - dm-control==1.0.8
+    - glfw==2.7.0
    - ffmpeg==1.4
-    - glfw==2.6.4
+    - imageio==2.34.1
+    - imageio-ffmpeg==0.4.9
+    - h5py==3.11.0
    - hydra-core==1.3.2
    - hydra-submitit-launcher==1.2.0
-    - imageio==2.33.1
-    - imageio-ffmpeg==0.4.9
-    - kornia==0.7.1
+    - submitit==1.5.1
+    - omegaconf==2.3.0
    - moviepy==1.0.3
    - mujoco==2.3.1
-    - mujoco-py==2.1.2.14
-    - numpy==1.23.5
-    - omegaconf==2.3.0
-    - open3d==0.18.0
-    - opencv-contrib-python==4.9.0.80
-    - opencv-python==4.9.0.80
-    - pandas==2.1.4
-    - sapien==2.2.1
-    - submitit==1.5.1
-    - setuptools==65.5.0
-    - patchelf==0.17.2.1
-    - protobuf==4.25.2
-    - pillow==10.2.0
-    - pyquaternion==0.9.9
-    - tensordict-nightly==2024.3.26
+    - numpy==1.24.4
+    - tensordict-nightly
+    - torchrl-nightly
+    - kornia==0.7.2
    - termcolor==2.4.0
-    - torchrl-nightly==2024.3.26
-    - transforms3d==0.4.1
-    - trimesh==4.0.9
-    - tqdm==4.66.1
-    - wandb==0.16.2
-    - wheel==0.38.0
+    - tqdm==4.66.4
+    - pandas==2.0.3
+    - wandb==0.17.4
+    - matplotlib==3.7.5
+    - seaborn==0.13.2
+    - gpustat==1.1.1
    ####################
    # Gym:
    # (unmaintained but required for maniskill2/meta-world/myosuite)
-    # - gym==0.21.0
+    - gym==0.21.0
    ####################
    # ManiSkill2:
    # (requires gym==0.21.0 which occasionally breaks)
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,34 @@
+absl-py
+cython
+dm-control
+ffmpeg
+glfw
+hydra-core
+hydra-submitit-launcher
+imageio
+imageio-ffmpeg
+kornia
+moviepy
+mujoco
+mujoco-py
+numpy<2
+omegaconf
+open3d
+opencv-contrib-python
+opencv-python
+pandas
+sapien
+submitit
+setuptools
+patchelf
+protobuf
+pillow
+pyquaternion
+tensordict-nightly
+termcolor
+torchrl-nightly
+transforms3d
+trimesh
+tqdm
+wandb
+wheel
--- a/tdmpc2/common/buffer.py
+++ b/tdmpc2/common/buffer.py
@@ -12,7 +12,7 @@ class Buffer():

 	def __init__(self, cfg):
 		self.cfg = cfg
-		self._device = torch.device('cuda')
+		self._device = torch.device('cuda:0')
 		self._capacity = min(cfg.buffer_size, cfg.steps)
 		self._sampler = SliceSampler(
 			num_slices=self.cfg.batch_size,
@@ -28,7 +28,7 @@ class Buffer():
 	def capacity(self):
 		"""Return the capacity of the buffer."""
 		return self._capacity
-	
+
 	@property
 	def num_eps(self):
 		"""Return the number of episodes in the buffer."""
@@ -41,8 +41,8 @@ class Buffer():
 		return ReplayBuffer(
 			storage=storage,
 			sampler=self._sampler,
-			pin_memory=True,
-			prefetch=1,
+			pin_memory=False,
+			prefetch=0,
 			batch_size=self._batch_size,
 		)

@@ -58,32 +58,30 @@ class Buffer():
 		total_bytes = bytes_per_step*self._capacity
 		print(f'Storage required: {total_bytes/1e9:.2f} GB')
 		# Heuristic: decide whether to use CUDA or CPU memory
-		storage_device = 'cuda' if 2.5*total_bytes < mem_free else 'cpu'
+		storage_device = 'cuda:0' if 2.5*total_bytes < mem_free else 'cpu'
 		print(f'Using {storage_device.upper()} memory for storage.')
+		self._storage_device = torch.device(storage_device)
 		return self._reserve_buffer(
-			LazyTensorStorage(self._capacity, device=torch.device(storage_device))
+			LazyTensorStorage(self._capacity, device=self._storage_device)
 		)

-	def _to_device(self, *args, device=None):
-		if device is None:
-			device = self._device
-		return (arg.to(device, non_blocking=True) \
-			if arg is not None else None for arg in args)
-
 	def _prepare_batch(self, td):
 		"""
 		Prepare a sampled batch for training (post-processing).
 		Expects `td` to be a TensorDict with batch size TxB.
 		"""
-		obs = td['obs']
-		action = td['action'][1:]
-		reward = td['reward'][1:].unsqueeze(-1)
-		task = td['task'][0] if 'task' in td.keys() else None
-		return self._to_device(obs, action, reward, task)
+		td = td.select("obs", "action", "reward", "task", strict=False).to(self._device, non_blocking=True)
+		obs = td.get('obs').contiguous()
+		action = td.get('action')[1:].contiguous()
+		reward = td.get('reward')[1:].unsqueeze(-1).contiguous()
+		task = td.get('task', None)
+		if task is not None:
+			task = task[0].contiguous()
+		return obs, action, reward, task

 	def add(self, td):
 		"""Add an episode to the buffer."""
-		td['episode'] = torch.ones_like(td['reward'], dtype=torch.int64) * self._num_eps
+		td['episode'] = torch.full_like(td['reward'], self._num_eps, dtype=torch.int64)
 		if self._num_eps == 0:
 			self._buffer = self._init(td)
 		self._buffer.extend(td)
--- a/tdmpc2/common/layers.py
+++ b/tdmpc2/common/layers.py
@@ -1,8 +1,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from functorch import combine_state_for_ensemble
-
+from tensordict import from_modules
+from copy import deepcopy

 class Ensemble(nn.Module):
 	"""
@@ -11,14 +11,18 @@ class Ensemble(nn.Module):

 	def __init__(self, modules, **kwargs):
 		super().__init__()
-		modules = nn.ModuleList(modules)
-		fn, params, _ = combine_state_for_ensemble(modules)
-		self.vmap = torch.vmap(fn, in_dims=(0, 0, None), randomness='different', **kwargs)
-		self.params = nn.ParameterList([nn.Parameter(p) for p in params])
+		# combine_state_for_ensemble causes graph breaks
+		self.params = from_modules(*modules, as_module=True)
+		with self.params[0].data.to("meta").to_module(modules[0]):
+			self.module = deepcopy(modules[0])
 		self._repr = str(modules)

+	def _call(self, params, *args, **kwargs):
+		with params.to_module(self.module):
+			return self.module(*args, **kwargs)
+
 	def forward(self, *args, **kwargs):
-		return self.vmap([p for p in self.params], (), *args, **kwargs)
+		return torch.vmap(self._call, (0, None), randomness="different")(self.params, *args, **kwargs)

 	def __repr__(self):
 		return 'Vectorized ' + self._repr
@@ -32,13 +36,13 @@ class ShiftAug(nn.Module):
 	def __init__(self, pad=3):
 		super().__init__()
 		self.pad = pad
+		self.padding = tuple([self.pad] * 4)

 	def forward(self, x):
 		x = x.float()
 		n, _, h, w = x.size()
 		assert h == w
-		padding = tuple([self.pad] * 4)
-		x = F.pad(x, padding, 'replicate')
+		x = F.pad(x, self.padding, 'replicate')
 		eps = 1.0 / (h + 2 * self.pad)
 		arange = torch.linspace(-1.0 + eps, 1.0 - eps, h + 2 * self.pad, device=x.device, dtype=x.dtype)[:h]
 		arange = arange.unsqueeze(0).repeat(h, 1).unsqueeze(2)
@@ -59,7 +63,7 @@ class PixelPreprocess(nn.Module):
 		super().__init__()

 	def forward(self, x):
-		return x.div_(255.).sub_(0.5)
+		return x.div(255.).sub(0.5)


 class SimNorm(nn.Module):
@@ -67,17 +71,17 @@ class SimNorm(nn.Module):
 	Simplicial normalization.
 	Adapted from https://arxiv.org/abs/2204.00616.
 	"""
-	
+
 	def __init__(self, cfg):
 		super().__init__()
 		self.dim = cfg.simnorm_dim
-	
+
 	def forward(self, x):
 		shp = x.shape
 		x = x.view(*shp[:-1], -1, self.dim)
 		x = F.softmax(x, dim=-1)
 		return x.view(*shp)
-		
+
 	def __repr__(self):
 		return f"SimNorm(dim={self.dim})"

@@ -87,18 +91,20 @@ class NormedLinear(nn.Linear):
 	Linear layer with LayerNorm, activation, and optionally dropout.
 	"""

-	def __init__(self, *args, dropout=0., act=nn.Mish(inplace=True), **kwargs):
+	def __init__(self, *args, dropout=0., act=None, **kwargs):
 		super().__init__(*args, **kwargs)
 		self.ln = nn.LayerNorm(self.out_features)
+		if act is None:
+			act = nn.Mish(inplace=False)
 		self.act = act
-		self.dropout = nn.Dropout(dropout, inplace=True) if dropout else None
+		self.dropout = nn.Dropout(dropout, inplace=False) if dropout else None

 	def forward(self, x):
 		x = super().forward(x)
 		if self.dropout:
 			x = self.dropout(x)
 		return self.act(self.ln(x))
-	
+
 	def __repr__(self):
 		repr_dropout = f", dropout={self.dropout.p}" if self.dropout else ""
 		return f"NormedLinear(in_features={self.in_features}, "\
@@ -130,9 +136,9 @@ def conv(in_shape, num_channels, act=None):
 	assert in_shape[-1] == 64 # assumes rgb observations to be 64x64
 	layers = [
 		ShiftAug(), PixelPreprocess(),
-		nn.Conv2d(in_shape[0], num_channels, 7, stride=2), nn.ReLU(inplace=True),
-		nn.Conv2d(num_channels, num_channels, 5, stride=2), nn.ReLU(inplace=True),
-		nn.Conv2d(num_channels, num_channels, 3, stride=2), nn.ReLU(inplace=True),
+		nn.Conv2d(in_shape[0], num_channels, 7, stride=2), nn.ReLU(inplace=False),
+		nn.Conv2d(num_channels, num_channels, 5, stride=2), nn.ReLU(inplace=False),
+		nn.Conv2d(num_channels, num_channels, 3, stride=2), nn.ReLU(inplace=False),
 		nn.Conv2d(num_channels, num_channels, 3, stride=1), nn.Flatten()]
 	if act:
 		layers.append(act)
--- a/tdmpc2/common/logger.py
+++ b/tdmpc2/common/logger.py
@@ -1,11 +1,11 @@
+import dataclasses
 import os
 import datetime
 import re
 import numpy as np
 import pandas as pd
 from termcolor import colored
-from omegaconf import OmegaConf
-
+from torchrl._utils import timeit
 from common import TASK_SET


@@ -133,7 +133,7 @@ class Logger:
 			group=self._group,
 			tags=cfg_to_group(cfg, return_list=True) + [f"seed:{cfg.seed}"],
 			dir=self._log_dir,
-			config=OmegaConf.to_container(cfg, resolve=True),
+			config=dataclasses.asdict(cfg),
 		)
 		print(colored("Logs will be synced with wandb.", "blue", attrs=["bold"]))
 		self._wandb = wandb
@@ -238,3 +238,5 @@ class Logger:
 				self._log_dir / "eval.csv", header=keys, index=None
 			)
 		self._print(d, category)
+		timeit.print()
+		timeit.erase()
--- a/tdmpc2/common/math.py
+++ b/tdmpc2/common/math.py
@@ -9,30 +9,30 @@ def soft_ce(pred, target, cfg):
 	return -(target * pred).sum(-1, keepdim=True)


-@torch.jit.script
+
 def log_std(x, low, dif):
 	return low + 0.5 * dif * (torch.tanh(x) + 1)


-@torch.jit.script
+
 def _gaussian_residual(eps, log_std):
 	return -0.5 * eps.pow(2) - log_std


-@torch.jit.script
 def _gaussian_logprob(residual):
-	return residual - 0.5 * torch.log(2 * torch.pi)
+	log2pi = 1.8378770351409912
+	return residual - 0.5 * log2pi


 def gaussian_logprob(eps, log_std, size=None):
 	"""Compute Gaussian log probability."""
 	residual = _gaussian_residual(eps, log_std).sum(-1, keepdim=True)
 	if size is None:
-		size = eps.size(-1)
+		size = eps.shape[-1]
 	return _gaussian_logprob(residual) * size


-@torch.jit.script
+
 def _squash(pi):
 	return torch.log(F.relu(1 - pi.pow(2)) + 1e-6)

@@ -45,7 +45,7 @@ def squash(mu, pi, log_pi):
 	return mu, pi, log_pi


-@torch.jit.script
+
 def symlog(x):
 	"""
 	Symmetric logarithmic function.
@@ -54,7 +54,7 @@ def symlog(x):
 	return torch.sign(x) * torch.log(1 + torch.abs(x))


-@torch.jit.script
+
 def symexp(x):
 	"""
 	Symmetric exponential function.
@@ -70,26 +70,32 @@ def two_hot(x, cfg):
 	elif cfg.num_bins == 1:
 		return symlog(x)
 	x = torch.clamp(symlog(x), cfg.vmin, cfg.vmax).squeeze(1)
-	bin_idx = torch.floor((x - cfg.vmin) / cfg.bin_size).long()
-	bin_offset = ((x - cfg.vmin) / cfg.bin_size - bin_idx.float()).unsqueeze(-1)
-	soft_two_hot = torch.zeros(x.size(0), cfg.num_bins, device=x.device)
-	soft_two_hot.scatter_(1, bin_idx.unsqueeze(1), 1 - bin_offset)
-	soft_two_hot.scatter_(1, (bin_idx.unsqueeze(1) + 1) % cfg.num_bins, bin_offset)
+	bin_idx = torch.floor((x - cfg.vmin) / cfg.bin_size)
+	bin_offset = ((x - cfg.vmin) / cfg.bin_size - bin_idx).unsqueeze(-1)
+	soft_two_hot = torch.zeros(x.shape[0], cfg.num_bins, device=x.device, dtype=x.dtype)
+	bin_idx = bin_idx.long()
+	soft_two_hot = soft_two_hot.scatter(1, bin_idx.unsqueeze(1), 1 - bin_offset)
+	soft_two_hot = soft_two_hot.scatter(1, (bin_idx.unsqueeze(1) + 1) % cfg.num_bins, bin_offset)
 	return soft_two_hot


-DREG_BINS = None
-
-
 def two_hot_inv(x, cfg):
 	"""Converts a batch of soft two-hot encoded vectors to scalars."""
-	global DREG_BINS
 	if cfg.num_bins == 0:
 		return x
 	elif cfg.num_bins == 1:
 		return symexp(x)
-	if DREG_BINS is None:
-		DREG_BINS = torch.linspace(cfg.vmin, cfg.vmax, cfg.num_bins, device=x.device)
+	dreg_bins = torch.linspace(cfg.vmin, cfg.vmax, cfg.num_bins, device=x.device, dtype=x.dtype)
 	x = F.softmax(x, dim=-1)
-	x = torch.sum(x * DREG_BINS, dim=-1, keepdim=True)
+	x = torch.sum(x * dreg_bins, dim=-1, keepdim=True)
 	return symexp(x)
+
+def gumbel_softmax_sample(p, temperature=1.0, dim=0):
+	logits = p.log()
+	# Generate Gumbel noise
+	gumbels = (
+		-torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_().log()
+	)  # ~Gumbel(0,1)
+	gumbels = (logits + gumbels) / temperature  # ~Gumbel(logits,tau)
+	y_soft = gumbels.softmax(dim)
+	return y_soft.argmax(-1)
--- a/tdmpc2/common/scale.py
+++ b/tdmpc2/common/scale.py
@@ -1,48 +1,53 @@
 import torch
+from torch.nn import Buffer

-
-class RunningScale:
+class RunningScale(torch.nn.Module):
 	"""Running trimmed scale estimator."""

 	def __init__(self, cfg):
+		super().__init__()
 		self.cfg = cfg
-		self._value = torch.ones(1, dtype=torch.float32, device=torch.device('cuda'))
-		self._percentiles = torch.tensor([5, 95], dtype=torch.float32, device=torch.device('cuda'))
+		self._value = Buffer(torch.ones(1, dtype=torch.float32, device=torch.device('cuda')))
+		self._percentiles = Buffer(torch.tensor([5, 95], dtype=torch.float32, device=torch.device('cuda')))

 	def state_dict(self):
 		return dict(value=self._value, percentiles=self._percentiles)

 	def load_state_dict(self, state_dict):
-		self._value.data.copy_(state_dict['value'])
-		self._percentiles.data.copy_(state_dict['percentiles'])
+		self._value.copy_(state_dict['value'])
+		self._percentiles.copy_(state_dict['percentiles'])

 	@property
 	def value(self):
-		return self._value.cpu().item()
+		return self._value
+
+	def _positions(self, x_shape):
+		positions = self._percentiles * (x_shape-1) / 100
+		floored = torch.floor(positions)
+		ceiled = floored + 1
+		ceiled = torch.where(ceiled > x_shape - 1, x_shape - 1, ceiled)
+		weight_ceiled = positions-floored
+		weight_floored = 1.0 - weight_ceiled
+		return floored.long(), ceiled.long(), weight_floored.unsqueeze(1), weight_ceiled.unsqueeze(1)

 	def _percentile(self, x):
 		x_dtype, x_shape = x.dtype, x.shape
-		x = x.view(x.shape[0], -1)
-		in_sorted, _ = torch.sort(x, dim=0)
-		positions = self._percentiles * (x.shape[0]-1) / 100
-		floored = torch.floor(positions)
-		ceiled = floored + 1
-		ceiled[ceiled > x.shape[0] - 1] = x.shape[0] - 1
-		weight_ceiled = positions-floored
-		weight_floored = 1.0 - weight_ceiled
-		d0 = in_sorted[floored.long(), :] * weight_floored[:, None]
-		d1 = in_sorted[ceiled.long(), :] * weight_ceiled[:, None]
-		return (d0+d1).view(-1, *x_shape[1:]).type(x_dtype)
+		x = x.flatten(1, x.ndim-1)
+		in_sorted = torch.sort(x, dim=0).values
+		floored, ceiled, weight_floored, weight_ceiled = self._positions(x.shape[0])
+		d0 = in_sorted[floored] * weight_floored
+		d1 = in_sorted[ceiled] * weight_ceiled
+		return (d0+d1).reshape(-1, *x_shape[1:]).to(x_dtype)

 	def update(self, x):
 		percentiles = self._percentile(x.detach())
 		value = torch.clamp(percentiles[1] - percentiles[0], min=1.)
-		self._value.data.lerp_(value, self.cfg.tau)
+		self._value.lerp_(value, self.cfg.tau)

-	def __call__(self, x, update=False):
+	def forward(self, x, update=False):
 		if update:
 			self.update(x)
-		return x * (1/self.value)
+		return x / self.value

 	def __repr__(self):
 		return f'RunningScale(S: {self.value})'
--- a/tdmpc2/common/world_model.py
+++ b/tdmpc2/common/world_model.py
@@ -5,7 +5,8 @@ import torch
 import torch.nn as nn

 from common import layers, math, init
-
+from tensordict import TensorDict
+from tensordict.nn import TensorDictParams

 class WorldModel(nn.Module):
 	"""
@@ -18,7 +19,7 @@ class WorldModel(nn.Module):
 		self.cfg = cfg
 		if cfg.multitask:
 			self._task_emb = nn.Embedding(len(cfg.tasks), cfg.task_dim, max_norm=1)
-			self._action_masks = torch.zeros(len(cfg.tasks), cfg.action_dim)
+			self.register_buffer("_action_masks", torch.zeros(len(cfg.tasks), cfg.action_dim))
 			for i in range(len(cfg.tasks)):
 				self._action_masks[i, :cfg.action_dims[i]] = 1.
 		self._encoder = layers.enc(cfg)
@@ -27,26 +28,35 @@ class WorldModel(nn.Module):
 		self._pi = layers.mlp(cfg.latent_dim + cfg.task_dim, 2*[cfg.mlp_dim], 2*cfg.action_dim)
 		self._Qs = layers.Ensemble([layers.mlp(cfg.latent_dim + cfg.action_dim + cfg.task_dim, 2*[cfg.mlp_dim], max(cfg.num_bins, 1), dropout=cfg.dropout) for _ in range(cfg.num_q)])
 		self.apply(init.weight_init)
-		init.zero_([self._reward[-1].weight, self._Qs.params[-2]])
-		self._target_Qs = deepcopy(self._Qs).requires_grad_(False)
-		self.log_std_min = torch.tensor(cfg.log_std_min)
-		self.log_std_dif = torch.tensor(cfg.log_std_max) - self.log_std_min
+		init.zero_([self._reward[-1].weight, self._Qs.params["2", "weight"]])
+
+		self.register_buffer("log_std_min", torch.tensor(cfg.log_std_min))
+		self.register_buffer("log_std_dif", torch.tensor(cfg.log_std_max) - self.log_std_min)
+		self.init()
+
+	def init(self):
+		# Create params
+		self._detach_Qs_params = TensorDictParams(self._Qs.params.data, no_convert=True)
+		self._target_Qs_params = TensorDictParams(self._Qs.params.data.clone(), no_convert=True)
+
+		# Create modules
+		with self._detach_Qs_params.data.to("meta").to_module(self._Qs.module):
+			self._detach_Qs = deepcopy(self._Qs)
+			self._target_Qs = deepcopy(self._Qs)
+
+		# Assign params to modules
+		self._detach_Qs.params = self._detach_Qs_params
+		self._target_Qs.params = self._target_Qs_params

 	@property
 	def total_params(self):
 		return sum(p.numel() for p in self.parameters() if p.requires_grad)
-		
+
 	def to(self, *args, **kwargs):
-		"""
-		Overriding `to` method to also move additional tensors to device.
-		"""
 		super().to(*args, **kwargs)
-		if self.cfg.multitask:
-			self._action_masks = self._action_masks.to(*args, **kwargs)
-		self.log_std_min = self.log_std_min.to(*args, **kwargs)
-		self.log_std_dif = self.log_std_dif.to(*args, **kwargs)
+		self.init()
 		return self
-	
+
 	def train(self, mode=True):
 		"""
 		Overriding `train` method to keep target Q-networks in eval mode.
@@ -55,26 +65,12 @@ class WorldModel(nn.Module):
 		self._target_Qs.train(False)
 		return self

-	def track_q_grad(self, mode=True):
-		"""
-		Enables/disables gradient tracking of Q-networks.
-		Avoids unnecessary computation during policy optimization.
-		This method also enables/disables gradients for task embeddings.
-		"""
-		for p in self._Qs.parameters():
-			p.requires_grad_(mode)
-		if self.cfg.multitask:
-			for p in self._task_emb.parameters():
-				p.requires_grad_(mode)
-
 	def soft_update_target_Q(self):
 		"""
 		Soft-update target Q-networks using Polyak averaging.
 		"""
-		with torch.no_grad():
-			for p, p_target in zip(self._Qs.parameters(), self._target_Qs.parameters()):
-				p_target.data.lerp_(p.data, self.cfg.tau)
-	
+		self._target_Qs_params.lerp_(self._detach_Qs_params, self.cfg.tau)
+
 	def task_emb(self, x, task):
 		"""
 		Continuous task embedding for multi-task experiments.
@@ -109,7 +105,7 @@ class WorldModel(nn.Module):
 			z = self.task_emb(z, task)
 		z = torch.cat([z, a], dim=-1)
 		return self._dynamics(z)
-	
+
 	def reward(self, z, a, task):
 		"""
 		Predicts instantaneous (single-step) reward.
@@ -147,7 +143,7 @@ class WorldModel(nn.Module):

 		return mu, pi, log_pi, log_std

-	def Q(self, z, a, task, return_type='min', target=False):
+	def Q(self, z, a, task, return_type='min', target=False, detach=False):
 		"""
 		Predict state-action value.
 		`return_type` can be one of [`min`, `avg`, `all`]:
@@ -160,13 +156,21 @@ class WorldModel(nn.Module):

 		if self.cfg.multitask:
 			z = self.task_emb(z, task)
-			
+
 		z = torch.cat([z, a], dim=-1)
-		out = (self._target_Qs if target else self._Qs)(z)
+		if target:
+			qnet = self._target_Qs
+		elif detach:
+			qnet = self._detach_Qs
+		else:
+			qnet = self._Qs
+		out = qnet(z)

 		if return_type == 'all':
 			return out

-		Q1, Q2 = out[np.random.choice(self.cfg.num_q, 2, replace=False)]
-		Q1, Q2 = math.two_hot_inv(Q1, self.cfg), math.two_hot_inv(Q2, self.cfg)
-		return torch.min(Q1, Q2) if return_type == 'min' else (Q1 + Q2) / 2
+		qidx = torch.randperm(self.cfg.num_q, device=out.device)[:2]
+		Q = math.two_hot_inv(out[qidx], self.cfg)
+		if return_type == "min":
+			return Q.min(0).values
+		return Q.sum(0) / 2
--- a/tdmpc2/config.yaml
+++ b/tdmpc2/config.yaml
@@ -86,3 +86,7 @@ action_dims: ???
 episode_lengths: ???
 seed_steps: ???
 bin_size: ???
+
+# compile
+compile: False
+cudagraphs: False
--- a/tdmpc2/tdmpc2.py
+++ b/tdmpc2/tdmpc2.py
@@ -1,13 +1,18 @@
-import numpy as np
 import torch
 import torch.nn.functional as F
+import functools
+from torchrl._utils import timeit

 from common import math
 from common.scale import RunningScale
 from common.world_model import WorldModel
+from tensordict.nn import CudaGraphModule
+from tensordict import TensorDict
+
+CG_WARMUP = 1000


-class TDMPC2:
+class TDMPC2(torch.nn.Module):
 	"""
 	TD-MPC2 agent. Implements training + inference.
 	Can be used for both single-task and multi-task experiments,
@@ -15,23 +20,71 @@ class TDMPC2:
 	"""

 	def __init__(self, cfg):
+		super().__init__()
 		self.cfg = cfg
-		self.device = torch.device('cuda')
+
+		self.device = torch.device('cuda:0')
+
 		self.model = WorldModel(cfg).to(self.device)
+		capturable = True
 		self.optim = torch.optim.Adam([
 			{'params': self.model._encoder.parameters(), 'lr': self.cfg.lr*self.cfg.enc_lr_scale},
 			{'params': self.model._dynamics.parameters()},
 			{'params': self.model._reward.parameters()},
 			{'params': self.model._Qs.parameters()},
-			{'params': self.model._task_emb.parameters() if self.cfg.multitask else []}
-		], lr=self.cfg.lr)
-		self.pi_optim = torch.optim.Adam(self.model._pi.parameters(), lr=self.cfg.lr, eps=1e-5)
+			{'params': self.model._task_emb.parameters() if self.cfg.multitask else []
+			 }
+		], lr=self.cfg.lr, capturable=capturable)
+		self.pi_optim = torch.optim.Adam(self.model._pi.parameters(), lr=self.cfg.lr, eps=1e-5, capturable=capturable)
 		self.model.eval()
 		self.scale = RunningScale(cfg)
 		self.cfg.iterations += 2*int(cfg.action_dim >= 20) # Heuristic for large action spaces
 		self.discount = torch.tensor(
-			[self._get_discount(ep_len) for ep_len in cfg.episode_lengths], device='cuda'
+			[self._get_discount(ep_len) for ep_len in cfg.episode_lengths], device='cuda:0'
 		) if self.cfg.multitask else self._get_discount(cfg.episode_length)
+		self._prev_mean = torch.nn.Buffer(torch.zeros(self.cfg.horizon, self.cfg.action_dim, device=self.device))
+
+		if cfg.compile:
+			mode = None if cfg.cudagraphs else "reduce-overhead"
+			print('compiling - update')
+			self._update = torch.compile(self._update, mode=mode)
+
+		if cfg.cudagraphs:
+			print('cudagraphs - update')
+			self._update = CudaGraphModule(self._update, warmup=CG_WARMUP)
+
+	@property
+	def plan(self):
+		_plan_val = getattr(self, "_plan_val", None)
+		if _plan_val is not None:
+			return _plan_val
+		if self.cfg.cudagraphs:
+			print('cudagraphs - plan')
+			self._plan_dict = {
+				(True, True): functools.partial(self._plan, t0=True, eval_mode=True),
+				(False, True): functools.partial(self._plan, t0=False, eval_mode=True),
+				(True, False): functools.partial(self._plan, t0=True, eval_mode=False),
+				(False, False): functools.partial(self._plan, t0=False, eval_mode=False),
+			}
+			if self.cfg.compile:
+				print('compiling - plan')
+				mode = None
+				self._plan_dict = {k: torch.compile(func, mode=mode) for k, func in self._plan_dict.items()}
+
+			self._plan_dict = {k: CudaGraphModule(func, warmup=CG_WARMUP) for k, func in self._plan_dict.items()}
+			def plan(obs, t0=False, eval_mode=False, task=None):
+				if task is not None:
+					kwargs = {"task": task}
+				else:
+					kwargs = {}
+				torch.compiler.cudagraph_mark_step_begin()
+				return self._plan_dict[(t0, eval_mode)](obs=obs, **kwargs)
+		elif self.cfg.compile:
+			plan = torch.compile(self._plan, mode="reduce-overhead")
+		else:
+			plan = self._plan
+		self._plan_val = plan
+		return self._plan_val

 	def _get_discount(self, episode_length):
 		"""
@@ -51,7 +104,7 @@ class TDMPC2:
 	def save(self, fp):
 		"""
 		Save state dict of the agent to filepath.
-		
+
 		Args:
 			fp (str): Filepath to save state dict to.
 		"""
@@ -60,7 +113,7 @@ class TDMPC2:
 	def load(self, fp):
 		"""
 		Load a saved state dict from filepath (or dictionary) into current agent.
-		
+
 		Args:
 			fp (str or dict): Filepath or state dict to load.
 		"""
@@ -71,23 +124,23 @@ class TDMPC2:
 	def act(self, obs, t0=False, eval_mode=False, task=None):
 		"""
 		Select an action by planning in the latent space of the world model.
-		
+
 		Args:
 			obs (torch.Tensor): Observation from the environment.
 			t0 (bool): Whether this is the first observation in the episode.
 			eval_mode (bool): Whether to use the mean of the action distribution.
 			task (int): Task index (only used for multi-task experiments).
-		
+
 		Returns:
 			torch.Tensor: Action to take in the environment.
 		"""
 		obs = obs.to(self.device, non_blocking=True).unsqueeze(0)
 		if task is not None:
 			task = torch.tensor([task], device=self.device)
-		z = self.model.encode(obs, task)
 		if self.cfg.mpc:
-			a = self.plan(z, t0=t0, eval_mode=eval_mode, task=task)
+			a = self.plan(obs, t0=t0, eval_mode=eval_mode, task=task)
 		else:
+			z = self.model.encode(obs, task)
 			a = self.model.pi(z, task)[int(not eval_mode)][0]
 		return a.cpu()

@@ -98,15 +151,16 @@ class TDMPC2:
 		for t in range(self.cfg.horizon):
 			reward = math.two_hot_inv(self.model.reward(z, actions[t], task), self.cfg)
 			z = self.model.next(z, actions[t], task)
-			G += discount * reward
-			discount *= self.discount[torch.tensor(task)] if self.cfg.multitask else self.discount
+			G = G + discount * reward
+			discount_update = self.discount[torch.tensor(task)] if self.cfg.multitask else self.discount
+			discount = discount * discount_update
 		return G + discount * self.model.Q(z, self.model.pi(z, task)[1], task, return_type='avg')

 	@torch.no_grad()
-	def plan(self, z, t0=False, eval_mode=False, task=None):
+	def _plan(self, obs, t0=False, eval_mode=False, task=None):
 		"""
 		Plan a sequence of actions using the learned world model.
-		
+
 		Args:
 			z (torch.Tensor): Latent state from which to plan.
 			t0 (bool): Whether this is the first observation in the episode.
@@ -115,8 +169,9 @@ class TDMPC2:

 		Returns:
 			torch.Tensor: Action to take in the environment.
-		"""		
+		"""
 		# Sample policy trajectories
+		z = self.model.encode(obs, task)
 		if self.cfg.num_pi_trajs > 0:
 			pi_actions = torch.empty(self.cfg.horizon, self.cfg.num_pi_trajs, self.cfg.action_dim, device=self.device)
 			_z = z.repeat(self.cfg.num_pi_trajs, 1)
@@ -128,52 +183,53 @@ class TDMPC2:
 		# Initialize state and parameters
 		z = z.repeat(self.cfg.num_samples, 1)
 		mean = torch.zeros(self.cfg.horizon, self.cfg.action_dim, device=self.device)
-		std = self.cfg.max_std*torch.ones(self.cfg.horizon, self.cfg.action_dim, device=self.device)
+		std = torch.full((self.cfg.horizon, self.cfg.action_dim), self.cfg.max_std, dtype=torch.float, device=self.device)
 		if not t0:
 			mean[:-1] = self._prev_mean[1:]
 		actions = torch.empty(self.cfg.horizon, self.cfg.num_samples, self.cfg.action_dim, device=self.device)
 		if self.cfg.num_pi_trajs > 0:
 			actions[:, :self.cfg.num_pi_trajs] = pi_actions
-	
+
 		# Iterate MPPI
 		for _ in range(self.cfg.iterations):

 			# Sample actions
-			actions[:, self.cfg.num_pi_trajs:] = (mean.unsqueeze(1) + std.unsqueeze(1) * \
-				torch.randn(self.cfg.horizon, self.cfg.num_samples-self.cfg.num_pi_trajs, self.cfg.action_dim, device=std.device)) \
-				.clamp(-1, 1)
+			r = torch.randn(self.cfg.horizon, self.cfg.num_samples-self.cfg.num_pi_trajs, self.cfg.action_dim, device=std.device)
+			actions_sample = mean.unsqueeze(1) + std.unsqueeze(1) * r
+			actions_sample = actions_sample.clamp(-1, 1)
+			actions[:, self.cfg.num_pi_trajs:] = actions_sample
 			if self.cfg.multitask:
 				actions = actions * self.model._action_masks[task]

 			# Compute elite actions
-			value = self._estimate_value(z, actions, task).nan_to_num_(0)
+			value = self._estimate_value(z, actions, task).nan_to_num(0)
 			elite_idxs = torch.topk(value.squeeze(1), self.cfg.num_elites, dim=0).indices
 			elite_value, elite_actions = value[elite_idxs], actions[:, elite_idxs]

 			# Update parameters
-			max_value = elite_value.max(0)[0]
+			max_value = elite_value.max(0).values
 			score = torch.exp(self.cfg.temperature*(elite_value - max_value))
-			score /= score.sum(0)
-			mean = torch.sum(score.unsqueeze(0) * elite_actions, dim=1) / (score.sum(0) + 1e-9)
-			std = torch.sqrt(torch.sum(score.unsqueeze(0) * (elite_actions - mean.unsqueeze(1)) ** 2, dim=1) / (score.sum(0) + 1e-9)) \
-				.clamp_(self.cfg.min_std, self.cfg.max_std)
+			score = score / score.sum(0)
+			mean = (score.unsqueeze(0) * elite_actions).sum(dim=1) / (score.sum(0) + 1e-9)
+			std = ((score.unsqueeze(0) * (elite_actions - mean.unsqueeze(1)) ** 2).sum(dim=1) / (score.sum(0) + 1e-9)).sqrt()
+			std = std.clamp(self.cfg.min_std, self.cfg.max_std)
 			if self.cfg.multitask:
 				mean = mean * self.model._action_masks[task]
 				std = std * self.model._action_masks[task]

 		# Select action
-		score = score.squeeze(1).cpu().numpy()
-		actions = elite_actions[:, np.random.choice(np.arange(score.shape[0]), p=score)]
-		self._prev_mean = mean
+		rand_idx = math.gumbel_softmax_sample(score.squeeze(1))  # gumbel_softmax_sample is compatible with cuda graphs
+		actions = torch.index_select(elite_actions, 1, rand_idx).squeeze(1)
 		a, std = actions[0], std[0]
 		if not eval_mode:
-			a += std * torch.randn(self.cfg.action_dim, device=std.device)
-		return a.clamp_(-1, 1)
-		
+			a = a + std * torch.randn(self.cfg.action_dim, device=std.device)
+		self._prev_mean.copy_(mean)
+		return a.clamp(-1, 1)
+
 	def update_pi(self, zs, task):
 		"""
 		Update policy using a sequence of latent states.
-		
+
 		Args:
 			zs (torch.Tensor): Sequence of latent states.
 			task (torch.Tensor): Task index (only used for multi-task experiments).
@@ -181,10 +237,8 @@ class TDMPC2:
 		Returns:
 			float: Loss of the policy update.
 		"""
-		self.pi_optim.zero_grad(set_to_none=True)
-		self.model.track_q_grad(False)
 		_, pis, log_pis, _ = self.model.pi(zs, task)
-		qs = self.model.Q(zs, pis, task, return_type='avg')
+		qs = self.model.Q(zs, pis, task, return_type='avg', detach=True)
 		self.scale.update(qs[0])
 		qs = self.scale(qs)

@@ -192,22 +246,23 @@ class TDMPC2:
 		rho = torch.pow(self.cfg.rho, torch.arange(len(qs), device=self.device))
 		pi_loss = ((self.cfg.entropy_coef * log_pis - qs).mean(dim=(1,2)) * rho).mean()
 		pi_loss.backward()
-		torch.nn.utils.clip_grad_norm_(self.model._pi.parameters(), self.cfg.grad_clip_norm)
+		pi_grad_norm = torch.nn.utils.clip_grad_norm_(self.model._pi.parameters(), self.cfg.grad_clip_norm)
 		self.pi_optim.step()
-		self.model.track_q_grad(True)
+		# For some reason, cudagraph prefers to see the zero grad after step
+		self.pi_optim.zero_grad(set_to_none=True)

-		return pi_loss.item()
+		return pi_loss.detach(), pi_grad_norm

 	@torch.no_grad()
 	def _td_target(self, next_z, reward, task):
 		"""
 		Compute the TD-target from a reward and the observation at the following time step.
-		
+
 		Args:
 			next_z (torch.Tensor): Latent state at the following time step.
 			reward (torch.Tensor): Reward at the current time step.
 			task (torch.Tensor): Task index (only used for multi-task experiments).
-		
+
 		Returns:
 			torch.Tensor: TD-target.
 		"""
@@ -218,22 +273,28 @@ class TDMPC2:
 	def update(self, buffer):
 		"""
 		Main update function. Corresponds to one iteration of model learning.
-		
+
 		Args:
 			buffer (common.buffer.Buffer): Replay buffer.
-		
+
 		Returns:
 			dict: Dictionary of training statistics.
 		"""
-		obs, action, reward, task = buffer.sample()
-	
+		with timeit("sample"):
+			obs, action, reward, task = buffer.sample()
+		kwargs = {}
+		if task is not None:
+			kwargs["task"] = task
+		torch.compiler.cudagraph_mark_step_begin()
+		return self._update(obs, action, reward, **kwargs)
+
+	def _update(self, obs, action, reward, task=None):
 		# Compute targets
 		with torch.no_grad():
 			next_z = self.model.encode(obs[1:], task)
 			td_targets = self._td_target(next_z, reward, task)

 		# Prepare for update
-		self.optim.zero_grad(set_to_none=True)
 		self.model.train()

 		# Latent rollout
@@ -241,25 +302,26 @@ class TDMPC2:
 		z = self.model.encode(obs[0], task)
 		zs[0] = z
 		consistency_loss = 0
-		for t in range(self.cfg.horizon):
-			z = self.model.next(z, action[t], task)
-			consistency_loss += F.mse_loss(z, next_z[t]) * self.cfg.rho**t
+		for t, (_action, _next_z) in enumerate(zip(action.unbind(0), next_z.unbind(0))):
+			z = self.model.next(z, _action, task)
+			consistency_loss = consistency_loss + F.mse_loss(z, _next_z) * self.cfg.rho**t
 			zs[t+1] = z

 		# Predictions
 		_zs = zs[:-1]
 		qs = self.model.Q(_zs, action, task, return_type='all')
 		reward_preds = self.model.reward(_zs, action, task)
-		
+
 		# Compute losses
 		reward_loss, value_loss = 0, 0
-		for t in range(self.cfg.horizon):
-			reward_loss += math.soft_ce(reward_preds[t], reward[t], self.cfg).mean() * self.cfg.rho**t
-			for q in range(self.cfg.num_q):
-				value_loss += math.soft_ce(qs[q][t], td_targets[t], self.cfg).mean() * self.cfg.rho**t
-		consistency_loss *= (1/self.cfg.horizon)
-		reward_loss *= (1/self.cfg.horizon)
-		value_loss *= (1/(self.cfg.horizon * self.cfg.num_q))
+		for t, (rew_pred_unbind, rew_unbind, td_targets_unbind, qs_unbind) in enumerate(zip(reward_preds.unbind(0), reward.unbind(0), td_targets.unbind(0), qs.unbind(1))):
+			reward_loss = reward_loss + math.soft_ce(rew_pred_unbind, rew_unbind, self.cfg).mean() * self.cfg.rho**t
+			for q, qs_unbind_unbind in enumerate(qs_unbind.unbind(0)):
+				value_loss = value_loss + math.soft_ce(qs_unbind_unbind, td_targets_unbind, self.cfg).mean() * self.cfg.rho**t
+
+		consistency_loss = consistency_loss / self.cfg.horizon
+		reward_loss = reward_loss / self.cfg.horizon
+		value_loss = value_loss / (self.cfg.horizon * self.cfg.num_q)
 		total_loss = (
 			self.cfg.consistency_coef * consistency_loss +
 			self.cfg.reward_coef * reward_loss +
@@ -270,21 +332,23 @@ class TDMPC2:
 		total_loss.backward()
 		grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg.grad_clip_norm)
 		self.optim.step()
+		self.optim.zero_grad(set_to_none=True)

 		# Update policy
-		pi_loss = self.update_pi(zs.detach(), task)
+		pi_loss, pi_grad_norm = self.update_pi(zs.detach(), task)

 		# Update target Q-functions
 		self.model.soft_update_target_Q()

 		# Return training statistics
 		self.model.eval()
-		return {
-			"consistency_loss": float(consistency_loss.mean().item()),
-			"reward_loss": float(reward_loss.mean().item()),
-			"value_loss": float(value_loss.mean().item()),
+		return TensorDict({
+			"consistency_loss": consistency_loss,
+			"reward_loss": reward_loss,
+			"value_loss": value_loss,
 			"pi_loss": pi_loss,
-			"total_loss": float(total_loss.mean().item()),
-			"grad_norm": float(grad_norm),
-			"pi_scale": float(self.scale.value),
-		}
+			"total_loss": total_loss,
+			"grad_norm": grad_norm,
+			"pi_grad_norm": pi_grad_norm,
+			"pi_scale": self.scale.value,
+		}).detach().mean()
--- a/tdmpc2/train.py
+++ b/tdmpc2/train.py
@@ -1,6 +1,8 @@
 import os
 os.environ['MUJOCO_GL'] = 'egl'
 os.environ['LAZY_LEGACY_OP'] = '0'
+os.environ['TORCHDYNAMO_INLINE_INBUILT_NN_MODULES'] = "1"
+os.environ['TORCH_LOGS'] = "+recompiles"
 import warnings
 warnings.filterwarnings('ignore')
 import torch
@@ -16,10 +18,27 @@ from tdmpc2 import TDMPC2
 from trainer.offline_trainer import OfflineTrainer
 from trainer.online_trainer import OnlineTrainer
 from common.logger import Logger
-
+import dataclasses
+from typing import Any
+from omegaconf import OmegaConf
 torch.backends.cudnn.benchmark = True


+def cfg_to_dataclass(cfg, frozen=False):
+	# Converts an OmegaConf config to a dataclass, which will not cause graph breaks
+	cfg_dict = OmegaConf.to_container(cfg)
+	fields = []
+	for key, value in cfg_dict.items():
+		fields.append((key, Any, dataclasses.field(default_factory=lambda value_=value: value_)))
+
+	# Create the dataclass
+	dataclass_name = "Config"
+	dataclass = dataclasses.make_dataclass(dataclass_name, fields, frozen=frozen)
+	def get(self, val, default=None):
+		return getattr(self, val, default)
+	dataclass.get = get
+	return dataclass()
+
@hydra.main(config_name='config', config_path='.')
 def train(cfg: dict):
 	"""
@@ -47,6 +66,9 @@ def train(cfg: dict):
 	print(colored('Work dir:', 'yellow', attrs=['bold']), cfg.work_dir)

 	trainer_cls = OfflineTrainer if cfg.multitask else OnlineTrainer
+
+	cfg = cfg_to_dataclass(cfg)
+
 	trainer = trainer_cls(
 		cfg=cfg,
 		env=make_env(cfg),
--- a/tdmpc2/trainer/online_trainer.py
+++ b/tdmpc2/trainer/online_trainer.py
@@ -3,7 +3,7 @@ from time import time
 import numpy as np
 import torch
 from tensordict.tensordict import TensorDict
-
+from torchrl._utils import timeit
 from trainer.base import Trainer


@@ -57,61 +57,64 @@ class OnlineTrainer(Trainer):
 			action = torch.full_like(self.env.rand_act(), float('nan'))
 		if reward is None:
 			reward = torch.tensor(float('nan'))
-		td = TensorDict(dict(
+		td = TensorDict(
 			obs=obs,
 			action=action.unsqueeze(0),
 			reward=reward.unsqueeze(0),
-		), batch_size=(1,))
+		batch_size=(1,))
 		return td

 	def train(self):
 		"""Train a TD-MPC2 agent."""
-		train_metrics, done, eval_next = {}, True, True
+		train_metrics, done, eval_next = {}, True, False
 		while self._step <= self.cfg.steps:
+			with timeit("global-step"):
+				# Evaluate agent periodically
+				if self._step > 0 and self._step % self.cfg.eval_freq == 0:
+					eval_next = True

-			# Evaluate agent periodically
-			if self._step % self.cfg.eval_freq == 0:
-				eval_next = True
+				# Reset environment
+				if done or (self._step == self.cfg.seed_steps + 1):
+					if eval_next:
+						eval_metrics = self.eval()
+						eval_metrics.update(self.common_metrics())
+						self.logger.log(eval_metrics, 'eval')
+						eval_next = False

-			# Reset environment
-			if done:
-				if eval_next:
-					eval_metrics = self.eval()
-					eval_metrics.update(self.common_metrics())
-					self.logger.log(eval_metrics, 'eval')
-					eval_next = False
+					if self._step > 0:
+						train_metrics.update(
+							episode_reward=torch.tensor([td['reward'] for td in self._tds[1:]]).sum(),
+							episode_success=info['success'],
+						)
+						train_metrics.update(self.common_metrics())
+						train_metrics.update(timeit.todict())
+						self.logger.log(train_metrics, 'train')
+						self._ep_idx = self.buffer.add(torch.cat(self._tds))

-				if self._step > 0:
-					train_metrics.update(
-						episode_reward=torch.tensor([td['reward'] for td in self._tds[1:]]).sum(),
-						episode_success=info['success'],
-					)
-					train_metrics.update(self.common_metrics())
-					self.logger.log(train_metrics, 'train')
-					self._ep_idx = self.buffer.add(torch.cat(self._tds))
+					obs = self.env.reset()
+					self._tds = [self.to_td(obs)]

-				obs = self.env.reset()
-				self._tds = [self.to_td(obs)]
+				# Collect experience
+				with timeit("act"):
+					if self._step > self.cfg.seed_steps:
+						action = self.agent.act(obs, t0=len(self._tds)==1)
+					else:
+						action = self.env.rand_act()
+				obs, reward, done, info = self.env.step(action)
+				self._tds.append(self.to_td(obs, action, reward))

-			# Collect experience
-			if self._step > self.cfg.seed_steps:
-				action = self.agent.act(obs, t0=len(self._tds)==1)
-			else:
-				action = self.env.rand_act()
-			obs, reward, done, info = self.env.step(action)
-			self._tds.append(self.to_td(obs, action, reward))
+				# Update agent
+				if self._step >= self.cfg.seed_steps:
+					if self._step == self.cfg.seed_steps:
+						num_updates = self.cfg.seed_steps
+						print('Pretraining agent on seed data...')
+					else:
+						num_updates = 1
+					for _ in range(num_updates):
+						with timeit("update"):
+						   _train_metrics = self.agent.update(self.buffer)
+					train_metrics.update(_train_metrics)

-			# Update agent
-			if self._step >= self.cfg.seed_steps:
-				if self._step == self.cfg.seed_steps:
-					num_updates = self.cfg.seed_steps
-					print('Pretraining agent on seed data...')
-				else:
-					num_updates = 1
-				for _ in range(num_updates):
-					_train_metrics = self.agent.update(self.buffer)
-				train_metrics.update(_train_metrics)
+				self._step += 1

-			self._step += 1
-	
 		self.logger.finish(self.agent)