naive support for pixels

2023-12-22 07:34:40 -08:00
parent 445af9d81d
commit bfb1971898
6 changed files with 118 additions and 14 deletions
--- a/tdmpc2/common/layers.py
+++ b/tdmpc2/common/layers.py
@@ -24,6 +24,44 @@ class Ensemble(nn.Module):
 		return 'Vectorized ' + self._repr


+class ShiftAug(nn.Module):
+	"""
+	Random shift image augmentation.
+	Adapted from https://github.com/facebookresearch/drqv2
+	"""
+	def __init__(self, pad=3):
+		super().__init__()
+		self.pad = pad
+
+	def forward(self, x):
+		x = x.float()
+		n, _, h, w = x.size()
+		assert h == w
+		padding = tuple([self.pad] * 4)
+		x = F.pad(x, padding, 'replicate')
+		eps = 1.0 / (h + 2 * self.pad)
+		arange = torch.linspace(-1.0 + eps, 1.0 - eps, h + 2 * self.pad, device=x.device, dtype=x.dtype)[:h]
+		arange = arange.unsqueeze(0).repeat(h, 1).unsqueeze(2)
+		base_grid = torch.cat([arange, arange.transpose(1, 0)], dim=2)
+		base_grid = base_grid.unsqueeze(0).repeat(n, 1, 1, 1)
+		shift = torch.randint(0, 2 * self.pad + 1, size=(n, 1, 1, 2), device=x.device, dtype=x.dtype)
+		shift *= 2.0 / (h + 2 * self.pad)
+		grid = base_grid + shift
+		return F.grid_sample(x, grid, padding_mode='zeros', align_corners=False)
+
+
+class PixelPreprocess(nn.Module):
+	"""
+	Normalizes pixel observations to [-0.5, 0.5].
+	"""
+
+	def __init__(self):
+		super().__init__()
+
+	def forward(self, x):
+		return x.div_(255.).sub_(0.5)
+
+
 class SimNorm(nn.Module):
 	"""
 	Simplicial normalization.
@@ -69,16 +107,6 @@ class NormedLinear(nn.Linear):
 			f"act={self.act.__class__.__name__})"


-def enc(cfg, out={}):
-	"""
-	Returns a dictionary of encoders for each observation in the dict.
-	"""
-	for k in cfg.obs_shape.keys():
-		assert k == 'state'
-		out[k] = mlp(cfg.obs_shape[k][0] + cfg.task_dim, max(cfg.num_enc_layers-1, 1)*[cfg.enc_dim], cfg.latent_dim, act=SimNorm(cfg))
-	return nn.ModuleDict(out)
-
-
 def mlp(in_dim, mlp_dims, out_dim, act=None, dropout=0.):
 	"""
 	Basic building block of TD-MPC2.
@@ -92,3 +120,34 @@ def mlp(in_dim, mlp_dims, out_dim, act=None, dropout=0.):
 		mlp.append(NormedLinear(dims[i], dims[i+1], dropout=dropout*(i==0)))
 	mlp.append(NormedLinear(dims[-2], dims[-1], act=act) if act else nn.Linear(dims[-2], dims[-1]))
 	return nn.Sequential(*mlp)
+
+
+def conv(in_shape, num_channels, act=None):
+	"""
+	Basic convolutional encoder for TD-MPC2 with raw image observations.
+	4 layers of convolution with ReLU activations, followed by a linear layer.
+	"""
+	assert in_shape[-1] == 64 # assumes rgb observations to be 64x64
+	layers = [
+		ShiftAug(), PixelPreprocess(),
+		nn.Conv2d(in_shape[0], num_channels, 7, stride=2), nn.ReLU(inplace=True),
+		nn.Conv2d(num_channels, num_channels, 5, stride=2), nn.ReLU(inplace=True),
+		nn.Conv2d(num_channels, num_channels, 3, stride=2), nn.ReLU(inplace=True),
+		nn.Conv2d(num_channels, num_channels, 3, stride=1), nn.Flatten()]
+	if act:
+		layers.append(act)
+	return nn.Sequential(*layers)
+
+
+def enc(cfg, out={}):
+	"""
+	Returns a dictionary of encoders for each observation in the dict.
+	"""
+	for k in cfg.obs_shape.keys():
+		if k == 'state':
+			out[k] = mlp(cfg.obs_shape[k][0] + cfg.task_dim, max(cfg.num_enc_layers-1, 1)*[cfg.enc_dim], cfg.latent_dim, act=SimNorm(cfg))
+		elif k == 'rgb':
+			out[k] = conv(cfg.obs_shape[k], cfg.num_channels, act=SimNorm(cfg))
+		else:
+			raise NotImplementedError(f"Encoder for observation type {k} not implemented.")
+	return nn.ModuleDict(out)
--- a/tdmpc2/common/logger.py
+++ b/tdmpc2/common/logger.py
@@ -49,11 +49,11 @@ def print_run(cfg):
 			prefix + colored(f'{k.capitalize()+":":<15}', color, attrs=attrs), _limstr(v)
 		)

-	obs_dim = cfg.obs_shape['state'][0] if 'state' in cfg.obs_shape else cfg.obs_shape[0]
+	observations  = ", ".join([str(v) for v in cfg.obs_shape.values()])
 	kvs = [
 		("task", cfg.task_title),
 		("steps", f"{int(cfg.steps):,}"),
-		("observations", obs_dim),
+		("observations", observations),
 		("actions", cfg.action_dim),
 		("experiment", cfg.exp_name),
 	]
--- a/tdmpc2/common/world_model.py
+++ b/tdmpc2/common/world_model.py
@@ -97,7 +97,9 @@ class WorldModel(nn.Module):
 		"""
 		if self.cfg.multitask:
 			obs = self.task_emb(obs, task)
-		return self._encoder['state'](obs)
+		if self.cfg.obs == 'rgb' and obs.ndim == 5:
+			return torch.stack([self._encoder[self.cfg.obs](o) for o in obs])
+		return self._encoder[self.cfg.obs](obs)

 	def next(self, z, a, task):
 		"""
--- a/tdmpc2/config.yaml
+++ b/tdmpc2/config.yaml
@@ -3,6 +3,7 @@ defaults:

 # environment
 task: dog-run
+obs: state

 # evaluation
 checkpoint: ???
@@ -52,6 +53,7 @@ vmax: +10
 model_size: ???
 num_enc_layers: 2
 enc_dim: 256
+num_channels: 32
 mlp_dim: 512
 latent_dim: 512
 task_dim: 96
--- a/tdmpc2/envs/init.py
+++ b/tdmpc2/envs/init.py
@@ -4,6 +4,7 @@ import warnings
 import gym

 from envs.wrappers.multitask import MultitaskWrapper
+from envs.wrappers.pixels import PixelWrapper
 from envs.wrappers.tensor import TensorWrapper
 from envs.dmcontrol import make_env as make_dm_control_env
 from envs.maniskill import make_env as make_maniskill_env
@@ -52,10 +53,12 @@ def make_env(cfg):
 		if env is None:
 			raise UnknownTaskError(cfg.task)
 		env = TensorWrapper(env)
+	if cfg.get('obs', 'state') == 'rgb':
+		env = PixelWrapper(cfg, env)
 	try: # Dict
 		cfg.obs_shape = {k: v.shape for k, v in env.observation_space.spaces.items()}
 	except: # Box
-		cfg.obs_shape = {'state': env.observation_space.shape}
+		cfg.obs_shape = {cfg.get('obs', 'state'): env.observation_space.shape}
 	cfg.action_dim = env.action_space.shape[0]
 	cfg.episode_length = env.max_episode_steps
 	cfg.seed_steps = max(1000, 5*cfg.episode_length)
--- a/tdmpc2/envs/wrappers/pixels.py
+++ b/tdmpc2/envs/wrappers/pixels.py
@@ -0,0 +1,38 @@
+from collections import deque
+
+import gym
+import numpy as np
+import torch
+
+
+class PixelWrapper(gym.Wrapper):
+	"""
+	Wrapper for pixel observations. Compatible with DMControl environments.
+	"""
+
+	def __init__(self, cfg, env, num_frames=3, render_size=64):
+		super().__init__(env)
+		self.cfg = cfg
+		self.env = env
+		self.observation_space = gym.spaces.Box(
+			low=0, high=255, shape=(num_frames*3, render_size, render_size), dtype=np.uint8
+		)
+		self._frames = deque([], maxlen=num_frames)
+		self._render_size = render_size
+
+	def _get_obs(self):
+		frame = self.env.render(
+			mode='rgb_array', width=self._render_size, height=self._render_size
+		).transpose(2, 0, 1)
+		self._frames.append(frame)
+		return torch.from_numpy(np.concatenate(self._frames))
+
+	def reset(self):
+		self.env.reset()
+		for _ in range(self._frames.maxlen):
+			obs = self._get_obs()
+		return obs
+
+	def step(self, action):
+		_, reward, done, info = self.env.step(action)
+		return self._get_obs(), reward, done, info