diff --git a/dreamer.py b/dreamer.py
index 62f9631..5c8a17c 100644
--- a/dreamer.py
+++ b/dreamer.py
@@ -217,10 +217,12 @@ def make_env(config, mode):
env = wrappers.OneHotAction(env)
elif suite == "crafter":
import envs.crafter as crafter
+
env = crafter.Crafter(task, config.size)
env = wrappers.OneHotAction(env)
elif suite == "minecraft":
import envs.minecraft as minecraft
+
env = minecraft.make_env(task, size=config.size, break_speed=config.break_speed)
env = wrappers.OneHotAction(env)
else:
@@ -294,7 +296,15 @@ def main(config):
logprob = random_actor.log_prob(action)
return {"action": action, "logprob": logprob}, None
- state = tools.simulate(random_agent, train_envs, train_eps, config.traindir, logger, limit=config.dataset_size, steps=prefill)
+ state = tools.simulate(
+ random_agent,
+ train_envs,
+ train_eps,
+ config.traindir,
+ logger,
+ limit=config.dataset_size,
+ steps=prefill,
+ )
logger.step += prefill * config.action_repeat
print(f"Logger: ({logger.step} steps).")
@@ -317,12 +327,29 @@ def main(config):
logger.write()
print("Start evaluation.")
eval_policy = functools.partial(agent, training=False)
- tools.simulate(eval_policy, eval_envs, eval_eps, config.evaldir, logger, is_eval=True, episodes=config.eval_episode_num)
+ tools.simulate(
+ eval_policy,
+ eval_envs,
+ eval_eps,
+ config.evaldir,
+ logger,
+ is_eval=True,
+ episodes=config.eval_episode_num,
+ )
if config.video_pred_log:
video_pred = agent._wm.video_pred(next(eval_dataset))
logger.video("eval_openl", to_np(video_pred))
print("Start training.")
- state = tools.simulate(agent, train_envs, train_eps, config.traindir, logger, limit=config.dataset_size, steps=config.eval_every, state=state)
+ state = tools.simulate(
+ agent,
+ train_envs,
+ train_eps,
+ config.traindir,
+ logger,
+ limit=config.dataset_size,
+ steps=config.eval_every,
+ state=state,
+ )
torch.save(agent.state_dict(), logdir / "latest_model.pt")
for env in train_envs + eval_envs:
try:
diff --git a/envs/atari.py b/envs/atari.py
index 4b5dfa0..1ef4c81 100644
--- a/envs/atari.py
+++ b/envs/atari.py
@@ -5,6 +5,7 @@ import numpy as np
class Atari:
LOCK = None
metadata = {}
+
def __init__(
self,
name,
diff --git a/envs/crafter.py b/envs/crafter.py
index 5119353..5d9e56e 100644
--- a/envs/crafter.py
+++ b/envs/crafter.py
@@ -4,6 +4,7 @@ import numpy as np
class Crafter:
metadata = {}
+
def __init__(self, task, size=(64, 64), seed=None):
assert task in ("reward", "noreward")
import crafter
diff --git a/envs/dmc.py b/envs/dmc.py
index d041f31..1907410 100644
--- a/envs/dmc.py
+++ b/envs/dmc.py
@@ -4,6 +4,7 @@ import numpy as np
class DeepMindControl:
metadata = {}
+
def __init__(self, name, action_repeat=1, size=(64, 64), camera=None):
domain, task = name.split("_", 1)
if domain == "cup": # Only domain with multiple words.
diff --git a/envs/minecraft.py b/envs/minecraft.py
index c94525b..f33f52d 100644
--- a/envs/minecraft.py
+++ b/envs/minecraft.py
@@ -3,152 +3,148 @@ from . import minecraft_base
import gym
+
def make_env(task, *args, **kwargs):
return {
- 'wood': MinecraftWood,
- 'climb': MinecraftClimb,
- 'diamond': MinecraftDiamond,
- }[task](*args, **kwargs)
+ "wood": MinecraftWood,
+ "climb": MinecraftClimb,
+ "diamond": MinecraftDiamond,
+ }[task](*args, **kwargs)
class MinecraftWood:
+ def __init__(self, *args, **kwargs):
+ actions = BASIC_ACTIONS
+ self.rewards = [
+ CollectReward("log", repeated=1),
+ HealthReward(),
+ ]
+ env = minecraft_base.MinecraftBase(actions, *args, **kwargs)
- def __init__(self, *args, **kwargs):
- actions = BASIC_ACTIONS
- self.rewards = [
- CollectReward('log', repeated=1),
- HealthReward(),
- ]
- env = minecraft_base.MinecraftBase(actions, *args, **kwargs)
-
- def step(self, action):
- obs, reward, done, info = self.env.step(action)
- reward = sum([fn(obs, self.env.inventory) for fn in self.rewards])
- obs['reward'] = reward
- return obs, reward, done, info
+ def step(self, action):
+ obs, reward, done, info = self.env.step(action)
+ reward = sum([fn(obs, self.env.inventory) for fn in self.rewards])
+ obs["reward"] = reward
+ return obs, reward, done, info
class MinecraftClimb:
+ def __init__(self, *args, **kwargs):
+ actions = BASIC_ACTIONS
+ env = minecraft_base.MinecraftBase(actions, *args, **kwargs)
+ self._previous = None
+ self._health_reward = HealthReward()
- def __init__(self, *args, **kwargs):
- actions = BASIC_ACTIONS
- env = minecraft_base.MinecraftBase(actions, *args, **kwargs)
- self._previous = None
- self._health_reward = HealthReward()
-
- def step(self, action):
- obs, reward, done, info = self.env.step(action)
- x, y, z = obs['log_player_pos']
- height = np.float32(y)
- if obs['is_first']:
- self._previous = height
- reward = height - self._previous
- reward += self._health_reward(obs)
- obs['reward'] = reward
- self._previous = height
- return obs, reward, done, info
+ def step(self, action):
+ obs, reward, done, info = self.env.step(action)
+ x, y, z = obs["log_player_pos"]
+ height = np.float32(y)
+ if obs["is_first"]:
+ self._previous = height
+ reward = height - self._previous
+ reward += self._health_reward(obs)
+ obs["reward"] = reward
+ self._previous = height
+ return obs, reward, done, info
class MinecraftDiamond(gym.Wrapper):
+ def __init__(self, *args, **kwargs):
+ actions = {
+ **BASIC_ACTIONS,
+ "craft_planks": dict(craft="planks"),
+ "craft_stick": dict(craft="stick"),
+ "craft_crafting_table": dict(craft="crafting_table"),
+ "place_crafting_table": dict(place="crafting_table"),
+ "craft_wooden_pickaxe": dict(nearbyCraft="wooden_pickaxe"),
+ "craft_stone_pickaxe": dict(nearbyCraft="stone_pickaxe"),
+ "craft_iron_pickaxe": dict(nearbyCraft="iron_pickaxe"),
+ "equip_stone_pickaxe": dict(equip="stone_pickaxe"),
+ "equip_wooden_pickaxe": dict(equip="wooden_pickaxe"),
+ "equip_iron_pickaxe": dict(equip="iron_pickaxe"),
+ "craft_furnace": dict(nearbyCraft="furnace"),
+ "place_furnace": dict(place="furnace"),
+ "smelt_iron_ingot": dict(nearbySmelt="iron_ingot"),
+ }
+ self.rewards = [
+ CollectReward("log", once=1),
+ CollectReward("planks", once=1),
+ CollectReward("stick", once=1),
+ CollectReward("crafting_table", once=1),
+ CollectReward("wooden_pickaxe", once=1),
+ CollectReward("cobblestone", once=1),
+ CollectReward("stone_pickaxe", once=1),
+ CollectReward("iron_ore", once=1),
+ CollectReward("furnace", once=1),
+ CollectReward("iron_ingot", once=1),
+ CollectReward("iron_pickaxe", once=1),
+ CollectReward("diamond", once=1),
+ HealthReward(),
+ ]
+ env = minecraft_base.MinecraftBase(actions, *args, **kwargs)
+ super().__init__(env)
- def __init__(self, *args, **kwargs):
- actions = {
- **BASIC_ACTIONS,
- 'craft_planks': dict(craft='planks'),
- 'craft_stick': dict(craft='stick'),
- 'craft_crafting_table': dict(craft='crafting_table'),
- 'place_crafting_table': dict(place='crafting_table'),
- 'craft_wooden_pickaxe': dict(nearbyCraft='wooden_pickaxe'),
- 'craft_stone_pickaxe': dict(nearbyCraft='stone_pickaxe'),
- 'craft_iron_pickaxe': dict(nearbyCraft='iron_pickaxe'),
- 'equip_stone_pickaxe': dict(equip='stone_pickaxe'),
- 'equip_wooden_pickaxe': dict(equip='wooden_pickaxe'),
- 'equip_iron_pickaxe': dict(equip='iron_pickaxe'),
- 'craft_furnace': dict(nearbyCraft='furnace'),
- 'place_furnace': dict(place='furnace'),
- 'smelt_iron_ingot': dict(nearbySmelt='iron_ingot'),
- }
- self.rewards = [
- CollectReward('log', once=1),
- CollectReward('planks', once=1),
- CollectReward('stick', once=1),
- CollectReward('crafting_table', once=1),
- CollectReward('wooden_pickaxe', once=1),
- CollectReward('cobblestone', once=1),
- CollectReward('stone_pickaxe', once=1),
- CollectReward('iron_ore', once=1),
- CollectReward('furnace', once=1),
- CollectReward('iron_ingot', once=1),
- CollectReward('iron_pickaxe', once=1),
- CollectReward('diamond', once=1),
- HealthReward(),
- ]
- env = minecraft_base.MinecraftBase(actions, *args, **kwargs)
- super().__init__(env)
+ def step(self, action):
+ obs, reward, done, info = self.env.step(action)
+ reward = sum([fn(obs, self.env.inventory) for fn in self.rewards])
+ obs["reward"] = reward
+ return obs, reward, done, info
- def step(self, action):
- obs, reward, done, info = self.env.step(action)
- reward = sum([fn(obs, self.env.inventory) for fn in self.rewards])
- obs['reward'] = reward
- return obs, reward, done, info
-
- def reset(self):
- obs = self.env.reset()
- # called for reset of reward calculations
- _ = sum([fn(obs, self.env.inventory) for fn in self.rewards])
- return obs
+ def reset(self):
+ obs = self.env.reset()
+ # called for reset of reward calculations
+ _ = sum([fn(obs, self.env.inventory) for fn in self.rewards])
+ return obs
class CollectReward:
+ def __init__(self, item, once=0, repeated=0):
+ self.item = item
+ self.once = once
+ self.repeated = repeated
+ self.previous = 0
+ self.maximum = 0
- def __init__(self, item, once=0, repeated=0):
- self.item = item
- self.once = once
- self.repeated = repeated
- self.previous = 0
- self.maximum = 0
-
- def __call__(self, obs, inventory):
- current = inventory[self.item]
- if obs['is_first']:
- self.previous = current
- self.maximum = current
- return 0
- reward = self.repeated * max(0, current - self.previous)
- if self.maximum == 0 and current > 0:
- reward += self.once
- self.previous = current
- self.maximum = max(self.maximum, current)
- return reward
+ def __call__(self, obs, inventory):
+ current = inventory[self.item]
+ if obs["is_first"]:
+ self.previous = current
+ self.maximum = current
+ return 0
+ reward = self.repeated * max(0, current - self.previous)
+ if self.maximum == 0 and current > 0:
+ reward += self.once
+ self.previous = current
+ self.maximum = max(self.maximum, current)
+ return reward
class HealthReward:
+ def __init__(self, scale=0.01):
+ self.scale = scale
+ self.previous = None
- def __init__(self, scale=0.01):
- self.scale = scale
- self.previous = None
-
- def __call__(self, obs, inventory=None):
- health = obs['health']
- if obs['is_first']:
- self.previous = health
- return 0
- reward = self.scale * (health - self.previous)
- self.previous = health
- return np.float32(reward)
+ def __call__(self, obs, inventory=None):
+ health = obs["health"]
+ if obs["is_first"]:
+ self.previous = health
+ return 0
+ reward = self.scale * (health - self.previous)
+ self.previous = health
+ return np.float32(reward)
BASIC_ACTIONS = {
- 'noop': dict(),
- 'attack': dict(attack=1),
- 'turn_up': dict(camera=(-15, 0)),
- 'turn_down': dict(camera=(15, 0)),
- 'turn_left': dict(camera=(0, -15)),
- 'turn_right': dict(camera=(0, 15)),
- 'forward': dict(forward=1),
- 'back': dict(back=1),
- 'left': dict(left=1),
- 'right': dict(right=1),
- 'jump': dict(jump=1, forward=1),
- 'place_dirt': dict(place='dirt'),
+ "noop": dict(),
+ "attack": dict(attack=1),
+ "turn_up": dict(camera=(-15, 0)),
+ "turn_down": dict(camera=(15, 0)),
+ "turn_left": dict(camera=(0, -15)),
+ "turn_right": dict(camera=(0, 15)),
+ "forward": dict(forward=1),
+ "back": dict(back=1),
+ "left": dict(left=1),
+ "right": dict(right=1),
+ "jump": dict(jump=1, forward=1),
+ "place_dirt": dict(place="dirt"),
}
diff --git a/envs/minecraft_base.py b/envs/minecraft_base.py
index 55f6e29..e3ca82d 100644
--- a/envs/minecraft_base.py
+++ b/envs/minecraft_base.py
@@ -4,215 +4,232 @@ import threading
import numpy as np
import gym
+
class MinecraftBase(gym.Env):
+ _LOCK = threading.Lock()
- _LOCK = threading.Lock()
+ def __init__(
+ self,
+ actions,
+ repeat=1,
+ size=(64, 64),
+ break_speed=100.0,
+ gamma=10.0,
+ sticky_attack=30,
+ sticky_jump=10,
+ pitch_limit=(-60, 60),
+ logs=True,
+ ):
+ if logs:
+ logging.basicConfig(level=logging.DEBUG)
+ self._repeat = repeat
+ self._size = size
+ if break_speed != 1.0:
+ sticky_attack = 0
- def __init__(
- self, actions,
- repeat=1,
- size=(64, 64),
- break_speed=100.0,
- gamma=10.0,
- sticky_attack=30,
- sticky_jump=10,
- pitch_limit=(-60, 60),
- logs=True,
- ):
- if logs:
- logging.basicConfig(level=logging.DEBUG)
- self._repeat = repeat
- self._size = size
- if break_speed != 1.0:
- sticky_attack = 0
+ # Make env
+ with self._LOCK:
+ from . import minecraft_minerl
- # Make env
- with self._LOCK:
- from .import minecraft_minerl
- self._env = minecraft_minerl.MineRLEnv(size, break_speed, gamma).make()
- self._inventory = {}
+ self._env = minecraft_minerl.MineRLEnv(size, break_speed, gamma).make()
+ self._inventory = {}
- # Observations
- self._inv_keys = [
- k for k in self._flatten(self._env.observation_space.spaces) if k.startswith('inventory/')
- if k != 'inventory/log2']
- self._step = 0
- self._max_inventory = None
- self._equip_enum = self._env.observation_space[
- 'equipped_items']['mainhand']['type'].values.tolist()
+ # Observations
+ self._inv_keys = [
+ k
+ for k in self._flatten(self._env.observation_space.spaces)
+ if k.startswith("inventory/")
+ if k != "inventory/log2"
+ ]
+ self._step = 0
+ self._max_inventory = None
+ self._equip_enum = self._env.observation_space["equipped_items"]["mainhand"][
+ "type"
+ ].values.tolist()
- # Actions
- self._noop_action = minecraft_minerl.NOOP_ACTION
- actions = self._insert_defaults(actions)
- self._action_names = tuple(actions.keys())
- self._action_values = tuple(actions.values())
- message = f'Minecraft action space ({len(self._action_values)}):'
- print(message, ', '.join(self._action_names))
- self._sticky_attack_length = sticky_attack
- self._sticky_attack_counter = 0
- self._sticky_jump_length = sticky_jump
- self._sticky_jump_counter = 0
- self._pitch_limit = pitch_limit
- self._pitch = 0
+ # Actions
+ self._noop_action = minecraft_minerl.NOOP_ACTION
+ actions = self._insert_defaults(actions)
+ self._action_names = tuple(actions.keys())
+ self._action_values = tuple(actions.values())
+ message = f"Minecraft action space ({len(self._action_values)}):"
+ print(message, ", ".join(self._action_names))
+ self._sticky_attack_length = sticky_attack
+ self._sticky_attack_counter = 0
+ self._sticky_jump_length = sticky_jump
+ self._sticky_jump_counter = 0
+ self._pitch_limit = pitch_limit
+ self._pitch = 0
- @property
- def observation_space(self):
- return gym.spaces.Dict(
- {
- 'image': gym.spaces.Box(0, 255, self._size + (3,), np.uint8),
- 'inventory': gym.spaces.Box(-np.inf, np.inf, (len(self._inv_keys),), dtype=np.float32),
- 'inventory_max': gym.spaces.Box(-np.inf, np.inf, (len(self._inv_keys),), dtype=np.float32),
- 'equipped': gym.spaces.Box(-np.inf, np.inf, (len(self._equip_enum),), dtype=np.float32),
- 'reward': gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.float32),
- 'health': gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.float32),
- 'hunger': gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.float32),
- 'breath': gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.float32),
- 'is_first': gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.uint8),
- 'is_last': gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.uint8),
- 'is_terminal': gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.uint8),
- **{f'log_{k}': gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.int64) for k in self._inv_keys},
- 'log_player_pos': gym.spaces.Box(-np.inf, np.inf, (3,), dtype=np.float32),
+ @property
+ def observation_space(self):
+ return gym.spaces.Dict(
+ {
+ "image": gym.spaces.Box(0, 255, self._size + (3,), np.uint8),
+ "inventory": gym.spaces.Box(
+ -np.inf, np.inf, (len(self._inv_keys),), dtype=np.float32
+ ),
+ "inventory_max": gym.spaces.Box(
+ -np.inf, np.inf, (len(self._inv_keys),), dtype=np.float32
+ ),
+ "equipped": gym.spaces.Box(
+ -np.inf, np.inf, (len(self._equip_enum),), dtype=np.float32
+ ),
+ "reward": gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.float32),
+ "health": gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.float32),
+ "hunger": gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.float32),
+ "breath": gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.float32),
+ "is_first": gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.uint8),
+ "is_last": gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.uint8),
+ "is_terminal": gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.uint8),
+ **{
+ f"log_{k}": gym.spaces.Box(-np.inf, np.inf, (1,), dtype=np.int64)
+ for k in self._inv_keys
+ },
+ "log_player_pos": gym.spaces.Box(
+ -np.inf, np.inf, (3,), dtype=np.float32
+ ),
+ }
+ )
+
+ @property
+ def action_space(self):
+ space = gym.spaces.discrete.Discrete(len(self._action_values))
+ space.discrete = True
+ return space
+
+ def step(self, action):
+ action = action.copy()
+ action = self._action_values[action]
+ action = self._action(action)
+ following = self._noop_action.copy()
+ for key in ("attack", "forward", "back", "left", "right"):
+ following[key] = action[key]
+ for act in [action] + ([following] * (self._repeat - 1)):
+ obs, reward, done, info = self._env.step(act)
+ if "error" in info:
+ done = True
+ break
+ obs["is_first"] = False
+ obs["is_last"] = bool(done)
+ obs["is_terminal"] = bool(info.get("is_terminal", done))
+
+ obs = self._obs(obs)
+ self._step += 1
+ assert "pov" not in obs, list(obs.keys())
+ return obs, reward, done, info
+
+ @property
+ def inventory(self):
+ return self._inventory
+
+ def reset(self):
+ # inventory will be added in _obs
+ self._inventory = {}
+ self._max_inventory = None
+
+ with self._LOCK:
+ obs = self._env.reset()
+ obs["is_first"] = True
+ obs["is_last"] = False
+ obs["is_terminal"] = False
+ obs = self._obs(obs)
+
+ self._step = 0
+ self._sticky_attack_counter = 0
+ self._sticky_jump_counter = 0
+ self._pitch = 0
+ return obs
+
+ def _obs(self, obs):
+ obs = self._flatten(obs)
+ obs["inventory/log"] += obs.pop("inventory/log2")
+ self._inventory = {
+ k.split("/", 1)[1]: obs[k] for k in self._inv_keys if k != "inventory/air"
}
- )
+ inventory = np.array([obs[k] for k in self._inv_keys], np.float32)
+ if self._max_inventory is None:
+ self._max_inventory = inventory
+ else:
+ self._max_inventory = np.maximum(self._max_inventory, inventory)
+ index = self._equip_enum.index(obs["equipped_items/mainhand/type"])
+ equipped = np.zeros(len(self._equip_enum), np.float32)
+ equipped[index] = 1.0
+ player_x = obs["location_stats/xpos"]
+ player_y = obs["location_stats/ypos"]
+ player_z = obs["location_stats/zpos"]
+ obs = {
+ "image": obs["pov"],
+ "inventory": inventory,
+ "inventory_max": self._max_inventory.copy(),
+ "equipped": equipped,
+ "health": np.float32(obs["life_stats/life"] / 20),
+ "hunger": np.float32(obs["life_stats/food"] / 20),
+ "breath": np.float32(obs["life_stats/air"] / 300),
+ "reward": 0.0,
+ "is_first": obs["is_first"],
+ "is_last": obs["is_last"],
+ "is_terminal": obs["is_terminal"],
+ **{f"log_{k}": np.int64(obs[k]) for k in self._inv_keys},
+ "log_player_pos": np.array([player_x, player_y, player_z], np.float32),
+ }
+ for key, value in obs.items():
+ space = self.observation_space[key]
+ if not isinstance(value, np.ndarray):
+ value = np.array(value)
+ assert (key, value, value.dtype, value.shape, space)
+ return obs
- @property
- def action_space(self):
- space = gym.spaces.discrete.Discrete(len(self._action_values))
- space.discrete = True
- return space
+ def _action(self, action):
+ if self._sticky_attack_length:
+ if action["attack"]:
+ self._sticky_attack_counter = self._sticky_attack_length
+ if self._sticky_attack_counter > 0:
+ action["attack"] = 1
+ action["jump"] = 0
+ self._sticky_attack_counter -= 1
+ if self._sticky_jump_length:
+ if action["jump"]:
+ self._sticky_jump_counter = self._sticky_jump_length
+ if self._sticky_jump_counter > 0:
+ action["jump"] = 1
+ action["forward"] = 1
+ self._sticky_jump_counter -= 1
+ if self._pitch_limit and action["camera"][0]:
+ lo, hi = self._pitch_limit
+ if not (lo <= self._pitch + action["camera"][0] <= hi):
+ action["camera"] = (0, action["camera"][1])
+ self._pitch += action["camera"][0]
+ return action
- def step(self, action):
- action = action.copy()
- action = self._action_values[action]
- action = self._action(action)
- following = self._noop_action.copy()
- for key in ('attack', 'forward', 'back', 'left', 'right'):
- following[key] = action[key]
- for act in [action] + ([following] * (self._repeat - 1)):
- obs, reward, done, info = self._env.step(act)
- if 'error' in info:
- done = True
- break
- obs['is_first'] = False
- obs['is_last'] = bool(done)
- obs['is_terminal'] = bool(info.get('is_terminal', done))
+ def _insert_defaults(self, actions):
+ actions = {name: action.copy() for name, action in actions.items()}
+ for key, default in self._noop_action.items():
+ for action in actions.values():
+ if key not in action:
+ action[key] = default
+ return actions
- obs = self._obs(obs)
- self._step += 1
- assert 'pov' not in obs, list(obs.keys())
- return obs, reward, done, info
+ def _flatten(self, nest, prefix=None):
+ result = {}
+ for key, value in nest.items():
+ key = prefix + "/" + key if prefix else key
+ if isinstance(value, gym.spaces.Dict):
+ value = value.spaces
+ if isinstance(value, dict):
+ result.update(self._flatten(value, key))
+ else:
+ result[key] = value
+ return result
- @property
- def inventory(self):
- return self._inventory
-
- def reset(self):
- # inventory will be added in _obs
- self._inventory = {}
- self._max_inventory = None
-
- with self._LOCK:
- obs = self._env.reset()
- obs['is_first'] = True
- obs['is_last'] = False
- obs['is_terminal'] = False
- obs = self._obs(obs)
-
- self._step = 0
- self._sticky_attack_counter = 0
- self._sticky_jump_counter = 0
- self._pitch = 0
- return obs
-
- def _obs(self, obs):
- obs = self._flatten(obs)
- obs['inventory/log'] += obs.pop('inventory/log2')
- self._inventory = {
- k.split('/', 1)[1]: obs[k] for k in self._inv_keys
- if k != 'inventory/air'}
- inventory = np.array([obs[k] for k in self._inv_keys], np.float32)
- if self._max_inventory is None:
- self._max_inventory = inventory
- else:
- self._max_inventory = np.maximum(self._max_inventory, inventory)
- index = self._equip_enum.index(obs['equipped_items/mainhand/type'])
- equipped = np.zeros(len(self._equip_enum), np.float32)
- equipped[index] = 1.0
- player_x = obs['location_stats/xpos']
- player_y = obs['location_stats/ypos']
- player_z = obs['location_stats/zpos']
- obs = {
- 'image': obs['pov'],
- 'inventory': inventory,
- 'inventory_max': self._max_inventory.copy(),
- 'equipped': equipped,
- 'health': np.float32(obs['life_stats/life'] / 20),
- 'hunger': np.float32(obs['life_stats/food'] / 20),
- 'breath': np.float32(obs['life_stats/air'] / 300),
- 'reward': 0.0,
- 'is_first': obs['is_first'],
- 'is_last': obs['is_last'],
- 'is_terminal': obs['is_terminal'],
- **{f'log_{k}': np.int64(obs[k]) for k in self._inv_keys},
- 'log_player_pos': np.array([player_x, player_y, player_z], np.float32),
- }
- for key, value in obs.items():
- space = self.observation_space[key]
- if not isinstance(value, np.ndarray):
- value = np.array(value)
- assert (key, value, value.dtype, value.shape, space)
- return obs
-
- def _action(self, action):
- if self._sticky_attack_length:
- if action['attack']:
- self._sticky_attack_counter = self._sticky_attack_length
- if self._sticky_attack_counter > 0:
- action['attack'] = 1
- action['jump'] = 0
- self._sticky_attack_counter -= 1
- if self._sticky_jump_length:
- if action['jump']:
- self._sticky_jump_counter = self._sticky_jump_length
- if self._sticky_jump_counter > 0:
- action['jump'] = 1
- action['forward'] = 1
- self._sticky_jump_counter -= 1
- if self._pitch_limit and action['camera'][0]:
- lo, hi = self._pitch_limit
- if not (lo <= self._pitch + action['camera'][0] <= hi):
- action['camera'] = (0, action['camera'][1])
- self._pitch += action['camera'][0]
- return action
-
- def _insert_defaults(self, actions):
- actions = {name: action.copy() for name, action in actions.items()}
- for key, default in self._noop_action.items():
- for action in actions.values():
- if key not in action:
- action[key] = default
- return actions
-
- def _flatten(self, nest, prefix=None):
- result = {}
- for key, value in nest.items():
- key = prefix + '/' + key if prefix else key
- if isinstance(value, gym.spaces.Dict):
- value = value.spaces
- if isinstance(value, dict):
- result.update(self._flatten(value, key))
- else:
- result[key] = value
- return result
-
- def _unflatten(self, flat):
- result = {}
- for key, value in flat.items():
- parts = key.split('/')
- node = result
- for part in parts[:-1]:
- if part not in node:
- node[part] = {}
- node = node[part]
- node[parts[-1]] = value
- return result
\ No newline at end of file
+ def _unflatten(self, flat):
+ result = {}
+ for key, value in flat.items():
+ parts = key.split("/")
+ node = result
+ for part in parts[:-1]:
+ if part not in node:
+ node[part] = {}
+ node = node[part]
+ node[parts[-1]] = value
+ return result
diff --git a/envs/minecraft_minerl.py b/envs/minecraft_minerl.py
index b412218..61653fb 100644
--- a/envs/minecraft_minerl.py
+++ b/envs/minecraft_minerl.py
@@ -6,145 +6,155 @@ from minerl.herobraine.hero.mc import INVERSE_KEYMAP
def edit_options(**kwargs):
- import os, pathlib, re
- for word in os.popen('pip3 --version').read().split(' '):
- if '-packages/pip' in word:
- break
- else:
- raise RuntimeError('Could not found python package directory.')
- packages = pathlib.Path(word).parent
- filename = packages / 'minerl/Malmo/Minecraft/run/options.txt'
- options = filename.read_text()
- if 'fovEffectScale:' not in options:
- options += 'fovEffectScale:1.0\n'
- if 'simulationDistance:' not in options:
- options += 'simulationDistance:12\n'
- for key, value in kwargs.items():
- assert f'{key}:' in options, key
- assert isinstance(value, str), (value, type(value))
- options = re.sub(f'{key}:.*\n', f'{key}:{value}\n', options)
- filename.write_text(options)
+ import os, pathlib, re
+
+ for word in os.popen("pip3 --version").read().split(" "):
+ if "-packages/pip" in word:
+ break
+ else:
+ raise RuntimeError("Could not found python package directory.")
+ packages = pathlib.Path(word).parent
+ filename = packages / "minerl/Malmo/Minecraft/run/options.txt"
+ options = filename.read_text()
+ if "fovEffectScale:" not in options:
+ options += "fovEffectScale:1.0\n"
+ if "simulationDistance:" not in options:
+ options += "simulationDistance:12\n"
+ for key, value in kwargs.items():
+ assert f"{key}:" in options, key
+ assert isinstance(value, str), (value, type(value))
+ options = re.sub(f"{key}:.*\n", f"{key}:{value}\n", options)
+ filename.write_text(options)
edit_options(
- difficulty='2',
- renderDistance='6',
- simulationDistance='6',
- fovEffectScale='0.0',
- ao='1',
- gamma='5.0',
+ difficulty="2",
+ renderDistance="6",
+ simulationDistance="6",
+ fovEffectScale="0.0",
+ ao="1",
+ gamma="5.0",
)
class MineRLEnv(EnvSpec):
+ def __init__(self, resolution=(64, 64), break_speed=50, gamma=10.0):
+ self.resolution = resolution
+ self.break_speed = break_speed
+ self.gamma = gamma
+ super().__init__(name="MineRLEnv-v1")
- def __init__(self, resolution=(64, 64), break_speed=50, gamma=10.0):
- self.resolution = resolution
- self.break_speed = break_speed
- self.gamma = gamma
- super().__init__(name='MineRLEnv-v1')
+ def create_agent_start(self):
+ return [
+ BreakSpeedMultiplier(self.break_speed),
+ ]
- def create_agent_start(self):
- return [
- BreakSpeedMultiplier(self.break_speed),
- ]
+ def create_agent_handlers(self):
+ return []
- def create_agent_handlers(self):
- return []
+ def create_server_world_generators(self):
+ return [handlers.DefaultWorldGenerator(force_reset=True)]
- def create_server_world_generators(self):
- return [handlers.DefaultWorldGenerator(force_reset=True)]
+ def create_server_quit_producers(self):
+ return [handlers.ServerQuitWhenAnyAgentFinishes()]
- def create_server_quit_producers(self):
- return [handlers.ServerQuitWhenAnyAgentFinishes()]
+ def create_server_initial_conditions(self):
+ return [
+ handlers.TimeInitialCondition(
+ allow_passage_of_time=True,
+ start_time=0,
+ ),
+ handlers.SpawningInitialCondition(
+ allow_spawning=True,
+ ),
+ ]
- def create_server_initial_conditions(self):
- return [
- handlers.TimeInitialCondition(
- allow_passage_of_time=True,
- start_time=0,
- ),
- handlers.SpawningInitialCondition(
- allow_spawning=True,
- )
- ]
+ def create_observables(self):
+ return [
+ handlers.POVObservation(self.resolution),
+ handlers.FlatInventoryObservation(mc.ALL_ITEMS),
+ handlers.EquippedItemObservation(
+ mc.ALL_ITEMS, _default="air", _other="other"
+ ),
+ handlers.ObservationFromCurrentLocation(),
+ handlers.ObservationFromLifeStats(),
+ ]
- def create_observables(self):
- return [
- handlers.POVObservation(self.resolution),
- handlers.FlatInventoryObservation(mc.ALL_ITEMS),
- handlers.EquippedItemObservation(
- mc.ALL_ITEMS, _default='air', _other='other'),
- handlers.ObservationFromCurrentLocation(),
- handlers.ObservationFromLifeStats(),
- ]
+ def create_actionables(self):
+ kw = dict(_other="none", _default="none")
+ return [
+ handlers.KeybasedCommandAction("forward", INVERSE_KEYMAP["forward"]),
+ handlers.KeybasedCommandAction("back", INVERSE_KEYMAP["back"]),
+ handlers.KeybasedCommandAction("left", INVERSE_KEYMAP["left"]),
+ handlers.KeybasedCommandAction("right", INVERSE_KEYMAP["right"]),
+ handlers.KeybasedCommandAction("jump", INVERSE_KEYMAP["jump"]),
+ handlers.KeybasedCommandAction("sneak", INVERSE_KEYMAP["sneak"]),
+ handlers.KeybasedCommandAction("attack", INVERSE_KEYMAP["attack"]),
+ handlers.CameraAction(),
+ handlers.PlaceBlock(["none"] + mc.ALL_ITEMS, **kw),
+ handlers.EquipAction(["none"] + mc.ALL_ITEMS, **kw),
+ handlers.CraftAction(["none"] + mc.ALL_ITEMS, **kw),
+ handlers.CraftNearbyAction(["none"] + mc.ALL_ITEMS, **kw),
+ handlers.SmeltItemNearby(["none"] + mc.ALL_ITEMS, **kw),
+ ]
- def create_actionables(self):
- kw = dict(_other='none', _default='none')
- return [
- handlers.KeybasedCommandAction('forward', INVERSE_KEYMAP['forward']),
- handlers.KeybasedCommandAction('back', INVERSE_KEYMAP['back']),
- handlers.KeybasedCommandAction('left', INVERSE_KEYMAP['left']),
- handlers.KeybasedCommandAction('right', INVERSE_KEYMAP['right']),
- handlers.KeybasedCommandAction('jump', INVERSE_KEYMAP['jump']),
- handlers.KeybasedCommandAction('sneak', INVERSE_KEYMAP['sneak']),
- handlers.KeybasedCommandAction('attack', INVERSE_KEYMAP['attack']),
- handlers.CameraAction(),
- handlers.PlaceBlock(['none'] + mc.ALL_ITEMS, **kw),
- handlers.EquipAction(['none'] + mc.ALL_ITEMS, **kw),
- handlers.CraftAction(['none'] + mc.ALL_ITEMS, **kw),
- handlers.CraftNearbyAction(['none'] + mc.ALL_ITEMS, **kw),
- handlers.SmeltItemNearby(['none'] + mc.ALL_ITEMS, **kw),
- ]
+ def is_from_folder(self, folder):
+ return folder == "none"
- def is_from_folder(self, folder):
- return folder == 'none'
+ def get_docstring(self):
+ return ""
- def get_docstring(self):
- return ''
+ def determine_success_from_rewards(self, rewards):
+ return True
- def determine_success_from_rewards(self, rewards):
- return True
+ def create_rewardables(self):
+ return []
- def create_rewardables(self):
- return []
+ def create_server_decorators(self):
+ return []
- def create_server_decorators(self):
- return []
+ def create_mission_handlers(self):
+ return []
- def create_mission_handlers(self):
- return []
-
- def create_monitors(self):
- return []
+ def create_monitors(self):
+ return []
class BreakSpeedMultiplier(handler.Handler):
+ def __init__(self, multiplier=1.0):
+ self.multiplier = multiplier
- def __init__(self, multiplier=1.0):
- self.multiplier = multiplier
+ def to_string(self):
+ return f"break_speed({self.multiplier})"
- def to_string(self):
- return f'break_speed({self.multiplier})'
-
- def xml_template(self):
- return '{{multiplier}}'
+ def xml_template(self):
+ return "{{multiplier}}"
class Gamma(handler.Handler):
+ def __init__(self, gamma=2.0):
+ self.gamma = gamma
- def __init__(self, gamma=2.0):
- self.gamma = gamma
+ def to_string(self):
+ return f"gamma({self.gamma})"
- def to_string(self):
- return f'gamma({self.gamma})'
-
- def xml_template(self):
- return '{{gamma}}'
+ def xml_template(self):
+ return "{{gamma}}"
NOOP_ACTION = dict(
- camera=(0, 0), forward=0, back=0, left=0, right=0, attack=0, sprint=0,
- jump=0, sneak=0, craft='none', nearbyCraft='none', nearbySmelt='none',
- place='none', equip='none',
+ camera=(0, 0),
+ forward=0,
+ back=0,
+ left=0,
+ right=0,
+ attack=0,
+ sprint=0,
+ jump=0,
+ sneak=0,
+ craft="none",
+ nearbyCraft="none",
+ nearbySmelt="none",
+ place="none",
+ equip="none",
)
diff --git a/envs/wrappers.py b/envs/wrappers.py
index af52602..b73e156 100644
--- a/envs/wrappers.py
+++ b/envs/wrappers.py
@@ -52,7 +52,6 @@ class OneHotAction(gym.Wrapper):
super().__init__(env)
self._random = np.random.RandomState()
-
def action_space(self):
shape = (self.env.action_space.n,)
space = gym.spaces.Box(low=0, high=1, shape=shape, dtype=np.float32)
@@ -83,7 +82,6 @@ class RewardObs(gym.Wrapper):
def __init__(self, env):
super().__init__(env)
-
def observation_space(self):
spaces = self.env.observation_space.spaces
if "reward" not in spaces:
@@ -110,17 +108,16 @@ class SelectAction(gym.Wrapper):
super().__init__(env)
self._key = key
-
def step(self, action):
return self.env.step(action[self._key])
+
class UUID(gym.Wrapper):
def __init__(self, env):
super().__init__(env)
timestamp = datetime.datetime.now().strftime("%Y%m%dT%H%M%S")
self.id = f"{timestamp}-{str(uuid.uuid4().hex)}"
-
def reset(self):
timestamp = datetime.datetime.now().strftime("%Y%m%dT%H%M%S")
self.id = f"{timestamp}-{str(uuid.uuid4().hex)}"
diff --git a/parallel.py b/parallel.py
index 792c5b0..2b5e48f 100644
--- a/parallel.py
+++ b/parallel.py
@@ -194,6 +194,7 @@ class Future:
self._complete = True
return self._result
+
class Damy:
def __init__(self, env):
self._env = env
@@ -202,7 +203,7 @@ class Damy:
return getattr(self._env, name)
def step(self, action):
- return lambda :self._env.step(action)
+ return lambda: self._env.step(action)
def reset(self):
- return lambda :self._env.reset()
\ No newline at end of file
+ return lambda: self._env.reset()
diff --git a/tools.py b/tools.py
index 4b14efa..c13e438 100644
--- a/tools.py
+++ b/tools.py
@@ -122,7 +122,18 @@ class Logger:
self._writer.add_video(name, value, step, 16)
-def simulate(agent, envs, cache, directory, logger, is_eval=False, limit=None, steps=0, episodes=0, state=None):
+def simulate(
+ agent,
+ envs,
+ cache,
+ directory,
+ logger,
+ is_eval=False,
+ limit=None,
+ steps=0,
+ episodes=0,
+ state=None,
+):
# initialize or unpack simulation state
if state is None:
step, episode = 0, 0
@@ -200,7 +211,7 @@ def simulate(agent, envs, cache, directory, logger, is_eval=False, limit=None, s
logger.scalar(f"train_episodes", len(cache))
logger.write(step=logger.step)
else:
- if not 'eval_lengths' in locals():
+ if not "eval_lengths" in locals():
eval_lengths = []
eval_scores = []
eval_done = False
@@ -278,6 +289,7 @@ class CollectDataset:
self.add_to_cache(transition)
return obs
+
def add_to_cache(cache, id, transition):
if id not in cache:
cache[id] = dict()
@@ -292,6 +304,7 @@ def add_to_cache(cache, id, transition):
else:
cache[id][key].append(convert(val))
+
def erase_over_episodes(cache, dataset_size):
step_in_dataset = 0
for key, ep in reversed(sorted(cache.items(), key=lambda x: x[0])):
@@ -304,6 +317,7 @@ def erase_over_episodes(cache, dataset_size):
del cache[key]
return step_in_dataset
+
def convert(value, precision=32):
value = np.array(value)
if np.issubdtype(value.dtype, np.floating):
@@ -318,6 +332,7 @@ def convert(value, precision=32):
raise NotImplementedError(value.dtype)
return value.astype(dtype)
+
def save_episodes(directory, episodes):
directory = pathlib.Path(directory).expanduser()
directory.mkdir(parents=True, exist_ok=True)