diff --git a/cooperative_cuisine/environment.py b/cooperative_cuisine/environment.py
index da168d72e1998ef77a93662c80372a1401316d12..edfaf4ca9f4b31a5bce9d0cfc6f290373696b18f 100644
--- a/cooperative_cuisine/environment.py
+++ b/cooperative_cuisine/environment.py
@@ -143,7 +143,6 @@ class Environment:
         self.env_time: datetime = create_init_env_time()
         """the internal time of the environment. An environment starts always with the time from
         `create_init_env_time`."""
-
         self.random: Random = Random(seed)
         """Random instance."""
         self.hook: Hooks = Hooks(self)
@@ -478,7 +477,6 @@ class Environment:
         """
         # self.hook(PRE_STEP, passed_time=passed_time)
         self.env_time += passed_time
-
         if self.game_ended:
             self.hook(GAME_ENDED_STEP, served_meals=self.order_manager.served_meals)
         else:
diff --git a/cooperative_cuisine/reinforcement_learning/config/environment/environment_config_rl.yaml b/cooperative_cuisine/reinforcement_learning/config/environment/environment_config_rl.yaml
index 779311b92239de553747b797348dc3fc5f5c06e3..5151d980a30826f68925fc320b03e6275a5077d1 100644
--- a/cooperative_cuisine/reinforcement_learning/config/environment/environment_config_rl.yaml
+++ b/cooperative_cuisine/reinforcement_learning/config/environment/environment_config_rl.yaml
@@ -52,7 +52,7 @@ layout_chars:

 orders:
   meals:
-    all: true
+    all: false
     # if all: false -> only orders for these meals are generated
     # TODO: what if this list is empty?
     list:
diff --git a/cooperative_cuisine/reinforcement_learning/config/environment/order_config.yaml b/cooperative_cuisine/reinforcement_learning/config/environment/order_config.yaml
index 9162088897ece096ce49202e6349271d7fc5006f..9cc3de7dbd523c5f814d87d86cb6ac51e807b999 100644
--- a/cooperative_cuisine/reinforcement_learning/config/environment/order_config.yaml
+++ b/cooperative_cuisine/reinforcement_learning/config/environment/order_config.yaml
@@ -30,4 +30,4 @@ orders:
   sample_on_serving: false
   # Sample the delay for the next order only after a meal was served.
   serving_not_ordered_meals: true
-  # can meals that are not ordered be served / dropped on the serving window
+  # can meals that are not ordered be served / dropped on the serving window
\ No newline at end of file
diff --git a/cooperative_cuisine/reinforcement_learning/config/model/A2C.yaml b/cooperative_cuisine/reinforcement_learning/config/model/A2C.yaml
index 3718bb3a2d7b5f29988351babf99bd90739a5f1e..7e01b0fd6956523ad006c512bb9020ec72ff9fe2 100644
--- a/cooperative_cuisine/reinforcement_learning/config/model/A2C.yaml
+++ b/cooperative_cuisine/reinforcement_learning/config/model/A2C.yaml
@@ -1,5 +1,5 @@
 env_id: "overcooked"
-policy_type: "MlpPolicy"
+policy: "MlpPolicy"
 model_type: "A2C"
 total_timesteps: 3_000_000 # Hendric suggests more like 300_000_000 steps
 number_envs_parallel: 64
@@ -18,7 +18,7 @@ rollout_buffer_class: None
 rollout_buffer_kwargs: None
 normalize_advantage: False
 stats_window_size: 100
-tensorboard_log: None
+tensorboard_log: f"logs/reinforcement_learning/runs/{0}"
 policy_kwargs: None
 verbose: 0
 seed: None
diff --git a/cooperative_cuisine/reinforcement_learning/config/model/DQN.yaml b/cooperative_cuisine/reinforcement_learning/config/model/DQN.yaml
index 84a55adceaa3e370ecd48e0ceae6a08f413c4865..9fb00ed1d0ef51ed156b4f3d77c51327e6f95fb8 100644
--- a/cooperative_cuisine/reinforcement_learning/config/model/DQN.yaml
+++ b/cooperative_cuisine/reinforcement_learning/config/model/DQN.yaml
@@ -1,5 +1,5 @@
 env_id: "overcooked"
-policy_type: "MlpPolicy"
+policy: "MlpPolicy"
 model_type: "DQN"
 total_timesteps: 3_000_000 # Hendric suggests more like 300_000_000 steps
 number_envs_parallel: 64
@@ -20,7 +20,7 @@ exploration_initial_eps: 1.0
 exploration_final_eps: 0.05
 max_grad_norm: 10
 stats_window_size: 100
-tensorboard_log: None
+tensorboard_log: f"logs/reinforcement_learning/runs/{0}"
 policy_kwargs: None
 verbose: 0
 seed: None
diff --git a/cooperative_cuisine/reinforcement_learning/config/model/PPO.yaml b/cooperative_cuisine/reinforcement_learning/config/model/PPO.yaml
index fb4a6d929d7c0abe4f24888c030068dfa894674d..f253155af0dd51b0f660dfb39e7f55a3053694c7 100644
--- a/cooperative_cuisine/reinforcement_learning/config/model/PPO.yaml
+++ b/cooperative_cuisine/reinforcement_learning/config/model/PPO.yaml
@@ -1,5 +1,5 @@
 env_id: "overcooked"
-policy_type: "MlpPolicy"
+policy: "MlpPolicy"
 model_type: "PPO"
 total_timesteps: 3_000_000 # Hendric suggests more like 300_000_000 steps
 number_envs_parallel: 64
@@ -21,9 +21,9 @@ rollout_buffer_class: None
 rollout_buffer_kwargs: None
 target_kl: None
 stats_window_size: 100
-tensorboard_log: None
+tensorboard_log: f"logs/reinforcement_learning/runs/{0}"
 policy_kwargs: None
 verbose: 0
 seed: None
-device: 'auto'
+device: "auto"
 _init_setup_model: True
diff --git a/cooperative_cuisine/reinforcement_learning/gym_env.py b/cooperative_cuisine/reinforcement_learning/gym_env.py
index a779466a24bd4e88a1f8d5c32a4e6e5258c4db5f..1ca2828b74f2ec73c93cf020b432bac02462beb6 100644
--- a/cooperative_cuisine/reinforcement_learning/gym_env.py
+++ b/cooperative_cuisine/reinforcement_learning/gym_env.py
@@ -199,6 +199,14 @@ class EnvGymWrapper(Env):
         self.prev_score = 0

     def step(self, action):
+        # Work-around to support a no-op action (index 8), which play_gym.py uses as its noop key.
+        if action == 8:
+            observation = self.get_observation()
+            reward = self.env.score - self.prev_score
+            terminated = self.env.game_ended
+            truncated = self.env.game_ended
+            info = {}
+            return observation, reward, terminated, truncated, info
         simple_action = self.action_space_map[action]
         env_action = get_env_action(
             self.player_id, simple_action, self.global_step_time
diff --git a/cooperative_cuisine/reinforcement_learning/play_gym.py b/cooperative_cuisine/reinforcement_learning/play_gym.py
new file mode 100644
index 0000000000000000000000000000000000000000..96c7e73636cf46f961323885a6fbf117184fdffd
--- /dev/null
+++ b/cooperative_cuisine/reinforcement_learning/play_gym.py
@@ -0,0 +1,15 @@
+import hydra
+from gymnasium.utils.play import play
+from omegaconf import DictConfig
+
+from gym_env import EnvGymWrapper, SimpleActionSpace
+
+
+@hydra.main(version_base="1.3", config_path="config", config_name="rl_config")
+def main(cfg: DictConfig):
+    env = EnvGymWrapper(cfg)
+    env.render_mode = "rgb_array"
+    play(env, keys_to_action={"a": 2, "d": 3, "w": 0, "s": 1, " ": 4, "k": 5}, noop=8)
+
+if __name__ == "__main__":
+    main()
diff --git a/cooperative_cuisine/reinforcement_learning/run_single_agent.py b/cooperative_cuisine/reinforcement_learning/run_single_agent.py
index 846564377b47518bd75ce6af8f8bc112961b90e8..c25e7265f1f79ac370c70ed0edbdc07d08744ab0 100644
--- a/cooperative_cuisine/reinforcement_learning/run_single_agent.py
+++ b/cooperative_cuisine/reinforcement_learning/run_single_agent.py
@@ -4,21 +4,32 @@ import cv2
 from stable_baselines3 import DQN

 from gym_env import EnvGymWrapper
+import hydra
+from omegaconf import DictConfig, OmegaConf

-model_save_path = "logs/reinforcement_learning/rl_agent_checkpoints/overcooked_DQN.zip"
-model_class = DQN
-model = model_class.load(model_save_path)
-env = EnvGymWrapper()
-
-# check_env(env)
-obs, info = env.reset()
-while True:
-    action, _states = model.predict(obs, deterministic=False)
-    obs, reward, terminated, truncated, info = env.step(int(action))
-    print(reward)
-    rgb_img = env.render()
-    cv2.imshow("env", rgb_img)
-    cv2.waitKey(0)
-    if terminated or truncated:
-        obs, info = env.reset()
-    time.sleep(1 / env.metadata["render_fps"])
+
+
+
+@hydra.main(version_base="1.3", config_path="config", config_name="rl_config")
+def main(cfg: DictConfig):
+    model_save_path = "logs/reinforcement_learning/rl_agent_checkpoints/overcooked_DQN.zip"
+    model_class = DQN
+    model = model_class.load(model_save_path)
+    env = EnvGymWrapper(cfg)
+
+    # check_env(env)
+    obs, info = env.reset()
+    while True:
+        action, _states = model.predict(obs, deterministic=False)
+        obs, reward, terminated, truncated, info = env.step(int(action))
+        print(reward)
+        rgb_img = env.render()
+        cv2.imshow("env", rgb_img)
+        cv2.waitKey(0)
+        if terminated or truncated:
+            obs, info = env.reset()
+        time.sleep(1 / env.metadata["render_fps"])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cooperative_cuisine/reinforcement_learning/train_single_agent.py b/cooperative_cuisine/reinforcement_learning/train_single_agent.py
index 1943a216408985db3da8b74b1edac26e12784d4c..dd48d5c1481f6d152ef34137da44c9d57bd8bec8 100644
--- a/cooperative_cuisine/reinforcement_learning/train_single_agent.py
+++ b/cooperative_cuisine/reinforcement_learning/train_single_agent.py
@@ -22,7 +22,6 @@ def main(cfg: DictConfig):
     rl_agent_checkpoints.mkdir(exist_ok=True)
     config = OmegaConf.to_container(cfg.model, resolve=True)
     debug = False
-    do_training = True
     vec_env = True
     models = {"A2C": A2C, "DQN": DQN, "PPO": PPO}
     number_envs_parallel = config["number_envs_parallel"]
@@ -53,49 +52,42 @@
     model_save_path = rl_agent_checkpoints / f"overcooked_{model_class.__name__}"

-    if do_training:
-        model = model_class(
-            config["policy_type"],
-            env,
-            verbose=1,
-            tensorboard_log=f"logs/reinforcement_learning/runs/{0}",
tensorboard_log=f"logs/reinforcement_learning/runs/{0}", - device="cpu" - # n_steps=2048, - # n_epochs=10, + filtered_config = {k: v for k, v in config.items() if + k not in ["env_id", "policy_type", "model_type", "total_timesteps", "number_envs_parallel"] and v != 'None'} + model = model_class( + env=env, + **filtered_config + ) + if debug: + model.learn( + total_timesteps=config["total_timesteps"], + log_interval=1, + progress_bar=True, + ) + else: + checkpoint_callback = CheckpointCallback( + save_freq=50_000, + save_path="logs", + name_prefix="rl_model", + save_replay_buffer=True, + save_vecnormalize=True, + ) + wandb_callback = WandbCallback( + model_save_path=f"logs/reinforcement_learning/models/{run.id}", + verbose=0, ) - # Maybe Hydra Instatiate here to avoid hard coding the possible classes - if debug: - model.learn( - total_timesteps=config["total_timesteps"], - log_interval=1, - progress_bar=True, - ) - else: - checkpoint_callback = CheckpointCallback( - save_freq=50_000, - save_path="logs", - name_prefix="rl_model", - save_replay_buffer=True, - save_vecnormalize=True, - ) - wandb_callback = WandbCallback( - model_save_path=f"logs/reinforcement_learning/models/{run.id}", - verbose=0, - ) - - callback = CallbackList([checkpoint_callback, wandb_callback]) - model.learn( - total_timesteps=config["total_timesteps"], - callback=callback, - log_interval=1, - progress_bar=True, - ) - run.finish() - model.save(model_save_path) + callback = CallbackList([checkpoint_callback, wandb_callback]) + model.learn( + total_timesteps=config["total_timesteps"], + callback=callback, + log_interval=1, + progress_bar=True, + ) + run.finish() + model.save(model_save_path) - del model - print("LEARNING DONE.") + del model if __name__ == "__main__":