Commit 642d3c1c authored by Florian Schröder

Merge branch '142-restructure-reinforcement-learning-files-2' into 'dev'

Integrated overcooked-ai into cooperative cuisine.

See merge request !103
parents ffa38868 e04f5020
2 merge requests: !110 V1.2.0 changes, !103 Integrated overcooked-ai into cooperative cuisine.
Pipeline #59221 passed
@@ -36,6 +36,18 @@ class SimpleActionSpace(Enum):
def get_env_action(player_id, simple_action, duration):
"""
Args:
player_id: id of the player
simple_action: an action in the form of a SimpleActionSpace
duration: for how long an action should be conducted
Returns: a concrete action
"""
match simple_action:
case SimpleActionSpace.Up:
return Action(
@@ -82,9 +94,6 @@ def get_env_action(player_id, simple_action, duration):
)
layout_path: Path = ROOT_DIR / "reinforcement_learning" / "rl_small.layout"
with open(layout_path, "r") as file:
layout = file.read()
with open(ROOT_DIR / "pygame_2d_vis" / "visualization.yaml", "r") as file:
visualization_config = yaml.safe_load(file)
@@ -94,6 +103,12 @@ visualizer.set_grid_size(40)
def shuffle_counters(env):
"""
Shuffles the counters of an environment
Args:
env: the environment object
"""
sample_counter = []
other_counters = []
for counter in env.counters:
@@ -110,11 +125,10 @@ def shuffle_counters(env):
class StateToObservationConverter:
'''
"""
Abstract definition of a class that gets an environment and outputs a state representation for RL
"""
'''
@abstractmethod
def setup(self, env):
...
@@ -132,25 +146,31 @@ class EnvGymWrapper(Env):
metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 10}
def __init__(self, config):
"""
Initializes all necessary variables.
Args:
config: the RL and environment configuration provided by Hydra
"""
super().__init__()
self.randomize_counter_placement = False
self.use_rgb_obs = True # if False uses simple vectorized state
self.use_rgb_obs = False # if False uses simple vectorized state
self.full_vector_state = True
config_env = OmegaConf.to_container(config.environment, resolve=True)
config_item_info = OmegaConf.to_container(config.item_info, resolve=True)
order_generator = config.additional_configs.order_generator
custom_config_path = ROOT_DIR / "reinforcement_learning" / "config" / order_generator
with open(custom_config_path, "r") as file:
custom_classes = yaml.load(file, Loader=yaml.Loader)
for key, value in config_env['hook_callbacks'].items():
value['callback_class'] = custom_classes['callback_class']
config_env["orders"]["order_gen_class"] = custom_classes['order_gen_class']
for val in config_env['hook_callbacks']:
config_env['hook_callbacks'][val]["callback_class"] = instantiate(config_env['hook_callbacks'][val]["callback_class"])
config_env["orders"]["order_gen_class"] = instantiate(config_env["orders"]["order_generator"])
self.config_env = config_env
self.config_item_info = config_item_info
layout_file = config_env["layout_name"]
layout_path: Path = ROOT_DIR / layout_file
with open(layout_path, "r") as file:
self.layout = file.read()
self.env: Environment = Environment(
env_config=deepcopy(config_env),
layout_config=layout,
layout_config=self.layout,
item_info=deepcopy(config_item_info),
as_files=False,
yaml_already_loaded=True,
@@ -197,6 +217,10 @@ class EnvGymWrapper(Env):
self.prev_score = 0
def step(self, action):
"""
Takes one step in the environment and returns the observation, reward,
whether the episode terminated or was truncated, and additional info.
"""
# Work-around to allow a no-op action, which play_gym.py requires
if action == 8:
observation = self.get_observation()
@@ -231,10 +255,14 @@ class EnvGymWrapper(Env):
return observation, reward, terminated, truncated, info
def reset(self, seed=None, options=None):
del visualizer.surface_cache_dict[self.env.env_name]
"""
Resets the environment according to the configs
"""
if self.env.env_name in visualizer.surface_cache_dict:
del visualizer.surface_cache_dict[self.env.env_name]
self.env: Environment = Environment(
env_config=deepcopy(self.config_env),
layout_config=layout,
layout_config=self.layout,
item_info=deepcopy(self.config_item_info),
as_files=False,
env_name=uuid.uuid4().hex,
......
@@ -8,13 +8,20 @@ from cooperative_cuisine.reinforcement_learning.gym_env import StateToObservatio
class BaseStateConverter(StateToObservationConverter):
"""
Converts an environment state to an encoding where each counter/item type has its own unique value
"""
def __init__(self):
self.onehot = False
self.grid_height: int | None = None
self.grid_width: int | None = None
self.counter_list = [
"Empty",
"Counter",
"PlateDispenser",
"TomatoDispenser",
"OnionDispenser",
"ServingWindow",
"PlateReturn",
"Trashcan",
@@ -26,28 +33,48 @@ class BaseStateConverter(StateToObservationConverter):
self.item_list = [
"None",
"Pot",
"PotOne",
"PotTwo",
"PotThree",
"PotDone",
"PotOne_Tomato",
"PotTwo_Tomato",
"PotThree_Tomato",
"PotDone_Tomato",
"PotOne_Onion",
"PotTwo_Onion",
"PotThree_Onion",
"PotDone_Onion",
"Tomato",
"Onion",
"ChoppedTomato",
"Plate",
"PlateTomatoSoup",
"PlateOnionSoup",
"PlateSalad",
"Lettuce",
"PlateChoppedTomato",
"PlateChoppedLettuce",
"ChoppedLettuce",
"ChoppedOnion",
]
self.player = "0"
def setup(self, env):
"""
Set the grid width and height according to the present environment.
"""
self.grid_width, self.grid_height = int(env.kitchen_width), int(
env.kitchen_height)
def convert_state_to_observation(self, env) -> np.ndarray:
"""
Convert the environment state into an integer (non-onehot) encoding
Args:
env: The environment object used
Returns: An encoding for the environment state that is not onehot
"""
grid_base_array = np.zeros(
(
self.grid_width,
@@ -115,18 +142,31 @@ class BaseStateConverter(StateToObservationConverter):
if item.name == "Pot":
if len(item.content_list) > 0:
if item.content_list[0].name == "TomatoSoup":
item_name = "PotDone"
item_name = "PotDone_Tomato"
if item.content_list[0].name == "OnionSoup":
item_name = "PotDone_Onion"
elif len(item.content_list) == 1:
item_name = "PotOne"
if item.content_list[0].name == "Tomato":
item_name = "PotOne_Tomato"
if item.content_list[0].name == "Onion":
item_name = "PotOne_Onion"
elif len(item.content_list) == 2:
item_name = "PotTwo"
if item.content_list[0].name == "Tomato":
item_name = "PotTwo_Tomato"
if item.content_list[0].name == "Onion":
item_name = "PotTwo_Onion"
elif len(item.content_list) == 3:
item_name = "PotThree"
if item.content_list[0].name == "Tomato":
item_name = "PotThree_Tomato"
if item.content_list[0].name == "Onion":
item_name = "PotThree_Onion"
if "Plate" in item.name:
content_list = [i.name for i in item.content_list]
match content_list:
case ["TomatoSoup"]:
item_name = "PlateTomatoSoup"
case ["OnionSoup"]:
item_name = "PlateOnionSoup"
case ["ChoppedTomato"]:
item_name = "PlateChoppedTomato"
case ["ChoppedLettuce"]:
......
@@ -8,15 +8,24 @@ from cooperative_cuisine.reinforcement_learning.gym_env import StateToObservatio
class BaseStateConverterOnehot(StateToObservationConverter):
"""
Converts an environment state to a one-hot encoding
"""
def __init__(self):
"""
Constructor setting basic variables as attributes.
"""
self.onehot = True
self.grid_height = None
self.grid_width = None
self.grid_height: int | None = None
self.grid_width: int | None = None
self.counter_list = [
"Empty",
"Counter",
"PlateDispenser",
"TomatoDispenser",
"OnionDispenser",
"ServingWindow",
"PlateReturn",
"Trashcan",
@@ -28,27 +37,51 @@ class BaseStateConverterOnehot(StateToObservationConverter):
self.item_list = [
"None",
"Pot",
"PotOne",
"PotTwo",
"PotThree",
"PotDone",
"PotOne_Tomato",
"PotTwo_Tomato",
"PotThree_Tomato",
"PotDone_Tomato",
"PotOne_Onion",
"PotTwo_Onion",
"PotThree_Onion",
"PotDone_Onion",
"Tomato",
"Onion",
"ChoppedTomato",
"Plate",
"PlateTomatoSoup",
"PlateOnionSoup",
"PlateSalad",
"Lettuce",
"PlateChoppedTomato",
"PlateChoppedLettuce",
"ChoppedLettuce",
"ChoppedOnion",
]
self.player = "0"
def setup(self, env):
"""
Set the grid width and height according to the present environment
Args:
env: The environment object used
"""
self.grid_width, self.grid_height = int(env.kitchen_width), int(
env.kitchen_height)
def convert_state_to_observation(self, env) -> np.ndarray:
"""
Convert the environment into an onehot encoding
Args:
env: The environment object used
Returns: An onehot encoding for the environment state
"""
grid_base_array = np.zeros(
(
self.grid_width,
@@ -92,7 +125,7 @@ class BaseStateConverterOnehot(StateToObservationConverter):
player_item_one_hot = self.vectorize_item(
env.players[self.player].holding, self.item_list
)
# concatenate the encodings of all entities into one large vector
final = np.concatenate(
(
counters.flatten(),
@@ -116,22 +149,36 @@ class BaseStateConverterOnehot(StateToObservationConverter):
else:
item_name = item.name
# Different pot names encode the pot contents and cooking progress; a cleaner implementation should eventually replace this.
if isinstance(item, CookingEquipment):
if item.name == "Pot":
if len(item.content_list) > 0:
if item.content_list[0].name == "TomatoSoup":
item_name = "PotDone"
item_name = "PotDone_Tomato"
if item.content_list[0].name == "OnionSoup":
item_name = "PotDone_Onion"
elif len(item.content_list) == 1:
item_name = "PotOne"
if item.content_list[0].name == "Tomato":
item_name = "PotOne_Tomato"
if item.content_list[0].name == "Onion":
item_name = "PotOne_Onion"
elif len(item.content_list) == 2:
item_name = "PotTwo"
if item.content_list[0].name == "Tomato":
item_name = "PotTwo_Tomato"
if item.content_list[0].name == "Onion":
item_name = "PotTwo_Onion"
elif len(item.content_list) == 3:
item_name = "PotThree"
if item.content_list[0].name == "Tomato":
item_name = "PotThree_Tomato"
if item.content_list[0].name == "Onion":
item_name = "PotThree_Onion"
if "Plate" in item.name:
content_list = [i.name for i in item.content_list]
match content_list:
case ["TomatoSoup"]:
item_name = "PlateTomatoSoup"
case ["OnionSoup"]:
item_name = "PlateOnionSoup"
case ["ChoppedTomato"]:
item_name = "PlateChoppedTomato"
case ["ChoppedLettuce"]:
......
# Overcooked-AI and Cooperative Cuisine
## Use the Overcooked-AI levels and configs in Cooperative Cuisine
All layouts from Overcooked-AI can be used within Cooperative Cuisine. Dedicated configs are defined and can be loaded via Hydra.
The file overcooked-ai_environment_config.yaml must be chosen as the environment config; under layout_name, any Overcooked-AI layout can be specified.
Additionally, item_info_overcooked-ai.yaml must be used as the item config.
With these configs selected, the layouts and rewards from Overcooked-AI are used; a sketch of composing such a configuration follows below.
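As a rough illustration (not part of this merge request), the following sketch composes such a configuration with Hydra's compose API and wraps it in the gym environment. The config group names (`environment`, `item_info`) and the layout path are assumptions inferred from this merge request and may differ from the actual config layout.

```python
from hydra import compose, initialize

from cooperative_cuisine.reinforcement_learning.gym_env import EnvGymWrapper

# Minimal sketch, assuming the Hydra config groups are named "environment"
# and "item_info" and that layouts are referenced relative to the package root.
with initialize(version_base="1.3", config_path="config"):
    cfg = compose(
        config_name="rl_config",
        overrides=[
            "environment=overcooked-ai_environment_config",  # assumed group name
            "item_info=item_info_overcooked-ai",             # assumed group name
            # any Overcooked-AI layout converted into the Cooperative Cuisine format
            "environment.layout_name=reinforcement_learning/layouts/overcooked_ai_layouts/cramped_room.layout",
        ],
    )

env = EnvGymWrapper(cfg)  # gymnasium-compatible wrapper defined in gym_env.py
obs, info = env.reset()
```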
## How is the connection between Overcooked-AI and Cooperative Cuisine defined?
Cooperative Cuisine is highly modular because Hydra is used as the config manager.
Therefore, the parameters used for Overcooked-AI are simply set in the dedicated config file.
The layout format differs, which is why a mapping is defined that converts an Overcooked-AI layout into a Cooperative Cuisine layout; the general idea is sketched below.
The layout file has to be present in cooperative_cuisine/reinforcement_learning/layouts/overcooked_ai_layouts.
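The conversion itself can be pictured as a character-by-character translation. The sketch below only illustrates this idea: the source characters are common Overcooked-AI grid symbols, but the target characters on the Cooperative Cuisine side are hypothetical placeholders, not the mapping actually used by the code.

```python
# Illustrative sketch of the layout mapping idea; the target characters are
# placeholders, not the real Cooperative Cuisine symbols used by the converter.
OVERCOOKED_AI_TO_COOPERATIVE_CUISINE: dict[str, str] = {
    "X": "#",  # counter (placeholder target symbol)
    "O": "N",  # onion dispenser (placeholder target symbol)
    "T": "T",  # tomato dispenser (placeholder target symbol)
    "D": "P",  # dish/plate dispenser (placeholder target symbol)
    "P": "U",  # pot / cooking station (placeholder target symbol)
    "S": "W",  # serving location (placeholder target symbol)
    " ": "_",  # free floor (placeholder target symbol)
}


def convert_overcooked_ai_layout(layout: str) -> str:
    """Translate an Overcooked-AI layout string line by line, character by character."""
    return "\n".join(
        "".join(OVERCOOKED_AI_TO_COOPERATIVE_CUISINE.get(char, char) for char in row)
        for row in layout.splitlines()
    )
```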
@@ -7,6 +7,9 @@ from gym_env import EnvGymWrapper, SimpleActionSpace
@hydra.main(version_base="1.3", config_path="config", config_name="rl_config")
def main(cfg: DictConfig):
"""
Lets a human control the agent in the RL environment via the keyboard.
"""
env = EnvGymWrapper(cfg)
env.render_mode = "rgb_array"
play(env, keys_to_action={"a": 2, "d": 3, "w": 0, "s": 1, " ": 4, "k": 5}, noop=8)
......
##X#
T__L
U__P
#C$#
import time
from pathlib import Path
import cv2
from stable_baselines3 import DQN, A2C, PPO
@@ -11,15 +12,17 @@ from hydra.utils import instantiate, call
@hydra.main(version_base="1.3", config_path="config", config_name="rl_config")
def main(cfg: DictConfig):
"""
Loads the trained model and lets the user watch an example episode with the corresponding rewards.
"""
additional_config = OmegaConf.to_container(cfg.additional_configs, resolve=True)
model_save_path = additional_config["log_path"] + "/" + additional_config["checkpoint_path"] + "/" + \
additional_config["project_name"] + "_" + OmegaConf.to_container(cfg.model, resolve=True)[
"model_name"]
model_save_path = Path(additional_config["log_path"]) / Path(additional_config["checkpoint_path"]) / Path(
additional_config["project_name"] + "_" + OmegaConf.to_container(cfg.model, resolve=True)["model_name"])
model_class = call(cfg.model.model_type_inference)
model = model_class(model_save_path)
env = EnvGymWrapper(cfg)
#check_env(env)
# check_env(env)
obs, info = env.reset()
print(obs)
while True:
......
from pathlib import Path
from typing import Any
import wandb
from omegaconf import DictConfig, OmegaConf
@@ -17,13 +18,17 @@ from hydra.utils import instantiate
@hydra.main(version_base="1.3", config_path="config", config_name="rl_config")
def main(cfg: DictConfig):
additional_configs = OmegaConf.to_container(cfg.additional_configs, resolve=True)
rl_logs = Path(additional_configs["log_path"])
"""
Trains an agent from scratch and saves the model to the specified path.
All configs are managed with Hydra.
"""
additional_configs: dict[str, Any] = OmegaConf.to_container(cfg.additional_configs, resolve=True)
rl_logs: Path = Path(additional_configs["log_path"])
rl_logs.mkdir(exist_ok=True)
rl_agent_checkpoints = rl_logs / Path(additional_configs["checkpoint_path"])
rl_agent_checkpoints: Path = rl_logs / Path(additional_configs["checkpoint_path"])
rl_agent_checkpoints.mkdir(exist_ok=True)
config = OmegaConf.to_container(cfg.model, resolve=True)
debug = additional_configs["debug_mode"]
config: dict[str, Any] = OmegaConf.to_container(cfg.model, resolve=True)
debug: bool = additional_configs["debug_mode"]
vec_env = additional_configs["vec_env"]
number_envs_parallel = config["number_envs_parallel"]
model_class = instantiate(cfg.model.model_type)
......