# PPO training configuration for the "overcooked" environment.
#
# NOTE(review): the `_target_` / `_partial_` keys follow Hydra's instantiate
# convention, and the hyperparameter keys below mirror the keyword arguments
# of the stable_baselines3.PPO constructor. Presumably the training script
# forwards them to that constructor; confirm against the consumer.

env_id: "overcooked"
policy: "MlpPolicy"
model_name: "PPO"

# Partial target for the model class (presumably used to build a new model
# for training; verify against the caller).
model_type:
  _partial_: true
  _target_: stable_baselines3.PPO

# Partial target for the model's `load` function (presumably used to restore
# a saved model for inference; verify against the caller).
model_type_inference:
  _partial_: true
  _target_: stable_baselines3.PPO.load

# Hendric suggests something closer to 300000000 steps.
# Written without digit-group underscores: `3_000_000` is only an integer
# under YAML 1.1 resolution and becomes a string under the YAML 1.2 core
# schema.
total_timesteps: 3000000
number_envs_parallel: 16

# Optimisation hyperparameters.
learning_rate: 0.0003
n_steps: 2048
batch_size: 16
n_epochs: 10
gamma: 0.99
gae_lambda: 0.95
clip_range: 0.2
# `null`, not `None`: YAML has no `None` literal, so a plain `None` is loaded
# as the string "None" rather than as a null value. The same applies to every
# `null` below.
clip_range_vf: null
normalize_advantage: true
ent_coef: 0.0
vf_coef: 0.5
max_grad_norm: 0.5
use_sde: false
sde_sample_freq: -1
rollout_buffer_class: null
rollout_buffer_kwargs: null
target_kl: null

# Logging.
stats_window_size: 100
# NOTE(review): YAML does not evaluate Python f-strings. As a plain scalar
# this value is the literal text, including the leading `f` and both double
# quotes. It is left unchanged because the consumer's handling of `{0}` is
# not visible from this file; confirm the intended path and drop the `f"..."`
# wrapper if a plain path is meant.
tensorboard_log: f"logs/reinforcement_learning/runs/{0}"

# Miscellaneous.
policy_kwargs: null
verbose: 0
seed: null
device: "auto"
_init_setup_model: true