ml-agents/mlagents/trainers/ppo/trainer.py

Killed 59 out of 82 mutants

Survived

The mutants below survived mutation testing: the test suite still passed with each change applied. These mutants show holes in your test suite.

Mutant 505

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -18,7 +18,7 @@
 from mlagents.trainers.settings import TrainerSettings, PPOSettings
 
 
-logger = get_logger(__name__)
+logger = None
 
 
 class PPOTrainer(RLTrainer):

Mutant 506

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -50,7 +50,7 @@
         self.hyperparameters: PPOSettings = cast(
             PPOSettings, self.trainer_settings.hyperparameters
         )
-        self.load = load
+        self.load = None
         self.seed = seed
         self.policy: Policy = None  # type: ignore
 

Mutant 508

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -52,7 +52,7 @@
         )
         self.load = load
         self.seed = seed
-        self.policy: Policy = None  # type: ignore
+        self.policy: Policy = ""  # type: ignore
 
     def _process_trajectory(self, trajectory: Trajectory) -> None:
         """

Mutant 509

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -61,7 +61,7 @@
         :param trajectory: The Trajectory tuple containing the steps to be processed.
         """
         super()._process_trajectory(trajectory)
-        agent_id = trajectory.agent_id  # All the agents should have the same ID
+        agent_id = None  # All the agents should have the same ID
 
         agent_buffer_trajectory = trajectory.to_agentbuffer()
         # Update the normalization

Mutant 515

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -81,7 +81,7 @@
             )
 
         # Evaluate all reward functions
-        self.collected_rewards["environment"][agent_id] += np.sum(
+        self.collected_rewards["environment"][agent_id] = np.sum(
             agent_buffer_trajectory["environment_rewards"]
         )
         for name, reward_signal in self.optimizer.reward_signals.items():

Mutant 516

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -81,7 +81,7 @@
             )
 
         # Evaluate all reward functions
-        self.collected_rewards["environment"][agent_id] += np.sum(
+        self.collected_rewards["environment"][agent_id] -= np.sum(
             agent_buffer_trajectory["environment_rewards"]
         )
         for name, reward_signal in self.optimizer.reward_signals.items():

Mutant 518

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -90,7 +90,7 @@
             ).scaled_reward
             agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
             # Report the reward signals
-            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
+            self.collected_rewards[name][agent_id] = np.sum(evaluate_result)
 
         # Compute GAE and returns
         tmp_advantages = []

Mutant 519

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -90,7 +90,7 @@
             ).scaled_reward
             agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
             # Report the reward signals
-            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
+            self.collected_rewards[name][agent_id] -= np.sum(evaluate_result)
 
         # Compute GAE and returns
         tmp_advantages = []

Mutant 524

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -109,7 +109,7 @@
                 gamma=self.optimizer.reward_signals[name].gamma,
                 lambd=self.hyperparameters.lambd,
             )
-            local_return = local_advantage + local_value_estimates
+            local_return = local_advantage - local_value_estimates
             # This is later use as target for the different value estimates
             agent_buffer_trajectory[f"{name}_returns"].set(local_return)
             agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)

Mutant 530

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -122,7 +122,7 @@
         )
         global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
         agent_buffer_trajectory["advantages"].set(global_advantages)
-        agent_buffer_trajectory["discounted_returns"].set(global_returns)
+        agent_buffer_trajectory["XXdiscounted_returnsXX"].set(global_returns)
         # Append to update buffer
         agent_buffer_trajectory.resequence_and_append(
             self.update_buffer, training_length=self.policy.sequence_length

Mutant 534

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -152,7 +152,7 @@
         # will need to reshape the data into a batch_size x sequence_length tensor.
         batch_size = (
             self.hyperparameters.batch_size
-            - self.hyperparameters.batch_size % self.policy.sequence_length
+            + self.hyperparameters.batch_size % self.policy.sequence_length
         )
         # Make sure there is at least one sequence
         batch_size = max(batch_size, self.policy.sequence_length)

Mutant 537

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -158,7 +158,7 @@
         batch_size = max(batch_size, self.policy.sequence_length)
 
         n_sequences = max(
-            int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
+            int(self.hyperparameters.batch_size * self.policy.sequence_length), 1
         )
 
         advantages = self.update_buffer["advantages"].get_batch()

Mutant 538

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -158,7 +158,7 @@
         batch_size = max(batch_size, self.policy.sequence_length)
 
         n_sequences = max(
-            int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
+            int(self.hyperparameters.batch_size / self.policy.sequence_length), 2
         )
 
         advantages = self.update_buffer["advantages"].get_batch()

Mutant 544

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -163,7 +163,7 @@
 
         advantages = self.update_buffer["advantages"].get_batch()
         self.update_buffer["advantages"].set(
-            (advantages - advantages.mean()) / (advantages.std() + 1e-10)
+            (advantages - advantages.mean()) / (advantages.std() - 1e-10)
         )
         num_epoch = self.hyperparameters.num_epoch
         batch_update_stats = defaultdict(list)

Mutant 545

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -163,7 +163,7 @@
 
         advantages = self.update_buffer["advantages"].get_batch()
         self.update_buffer["advantages"].set(
-            (advantages - advantages.mean()) / (advantages.std() + 1e-10)
+            (advantages - advantages.mean()) / (advantages.std() + 1.0000000001)
         )
         num_epoch = self.hyperparameters.num_epoch
         batch_update_stats = defaultdict(list)

Mutant 555

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -186,7 +186,7 @@
             for stat, val in update_stats.items():
                 self._stats_reporter.add_stat(stat, val)
         self._clear_update_buffer()
-        return True
+        return False
 
     def create_policy(
         self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec

Mutant 557

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -203,7 +203,7 @@
             model_path=self.artifact_path,
             load=self.load,
             condition_sigma_on_obs=False,  # Faster training for PPO
-            create_tf_graph=False,  # We will create the TF graph in the Optimizer
+            create_tf_graph=True,  # We will create the TF graph in the Optimizer
         )
 
         return policy

Mutant 561

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -230,7 +230,7 @@
         self.policies[parsed_behavior_id.behavior_id] = policy
         self.optimizer = self.create_ppo_optimizer()
         for _reward_signal in self.optimizer.reward_signals.keys():
-            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
+            self.collected_rewards[_reward_signal] = defaultdict(lambda: 1)
         # Needed to resume loads properly
         self.step = policy.get_current_step()
 

Mutant 565

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -243,7 +243,7 @@
         return self.policy
 
 
-def discount_rewards(r, gamma=0.99, value_next=0.0):
+def discount_rewards(r, gamma=1.99, value_next=0.0):
     """
     Computes discounted sum of future rewards for use in updating value estimate.
     :param r: List of rewards.

Mutant 574

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -259,7 +259,7 @@
     return discounted_r
 
 
-def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
+def get_gae(rewards, value_estimates, value_next=1.0, gamma=0.99, lambd=0.95):
     """
     Computes generalized advantage estimate for use in updating policy.
     :param rewards: list of rewards for time-steps t to T.

Mutant 575

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -259,7 +259,7 @@
     return discounted_r
 
 
-def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
+def get_gae(rewards, value_estimates, value_next=0.0, gamma=1.99, lambd=0.95):
     """
     Computes generalized advantage estimate for use in updating policy.
     :param rewards: list of rewards for time-steps t to T.

Mutant 576

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -259,7 +259,7 @@
     return discounted_r
 
 
-def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
+def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=1.95):
     """
     Computes generalized advantage estimate for use in updating policy.
     :param rewards: list of rewards for time-steps t to T.

Mutant 582

--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -270,7 +270,7 @@
     :return: list of advantage estimates for time-steps t to T.
     """
     value_estimates = np.append(value_estimates, value_next)
-    delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
+    delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:+1]
     advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
     return advantage