ml-agents/mlagents/trainers/sac/trainer.py

Killed 34 out of 79 mutants

Timeouts

Mutants that made the test suite run so much longer than normal that the test run was killed.

Mutant 469

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -237,7 +237,7 @@
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
             self.step - self.hyperparameters.buffer_init_steps
-        ) / self.update_steps > self.steps_per_update:
+        ) * self.update_steps > self.steps_per_update:
             logger.debug(f"Updating SAC policy at step {self.step}")
             buffer = self.update_buffer
             if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:

Mutant 472

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -240,7 +240,7 @@
         ) / self.update_steps > self.steps_per_update:
             logger.debug(f"Updating SAC policy at step {self.step}")
             buffer = self.update_buffer
-            if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:
+            if self.update_buffer.num_experiences > self.hyperparameters.batch_size:
                 sampled_minibatch = buffer.sample_mini_batch(
                     self.hyperparameters.batch_size,
                     sequence_length=self.policy.sequence_length,

Mutant 474

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -255,7 +255,7 @@
                 for stat_name, value in update_stats.items():
                     batch_update_stats[stat_name].append(value)
 
-                self.update_steps += 1
+                self.update_steps = 1
 
                 for stat, stat_list in batch_update_stats.items():
                     self._stats_reporter.add_stat(stat, np.mean(stat_list))

Mutant 487

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -291,7 +291,7 @@
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
             self.step - self.hyperparameters.buffer_init_steps
-        ) / self.reward_signal_update_steps > self.reward_signal_steps_per_update:
+        ) * self.reward_signal_update_steps > self.reward_signal_steps_per_update:
             # Get minibatches for reward signal update if needed
             reward_signal_minibatches = {}
             for name, signal in self.optimizer.reward_signals.items():

Mutant 490

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -307,7 +307,7 @@
             )
             for stat_name, value in update_stats.items():
                 batch_update_stats[stat_name].append(value)
-            self.reward_signal_update_steps += 1
+            self.reward_signal_update_steps = 1
 
             for stat, stat_list in batch_update_stats.items():
                 self._stats_reporter.add_stat(stat, np.mean(stat_list))

Survived

These mutants survived mutation testing: the test suite still passed with the mutation applied, which shows holes in your test suite.

Mutant 427

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -23,7 +23,7 @@
 
 logger = get_logger(__name__)
 
-BUFFER_TRUNCATE_PERCENT = 0.8
+BUFFER_TRUNCATE_PERCENT = 1.8
 
 
 class SACTrainer(RLTrainer):

Mutant 429

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -56,7 +56,7 @@
             brain_name, trainer_settings, training, artifact_path, reward_buff_cap
         )
 
-        self.load = load
+        self.load = None
         self.seed = seed
         self.policy: Policy = None  # type: ignore
         self.optimizer: SACOptimizer = None  # type: ignore

Mutant 431

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -58,7 +58,7 @@
 
         self.load = load
         self.seed = seed
-        self.policy: Policy = None  # type: ignore
+        self.policy: Policy = ""  # type: ignore
         self.optimizer: SACOptimizer = None  # type: ignore
         self.hyperparameters: SACSettings = cast(
             SACSettings, trainer_settings.hyperparameters

Mutant 432

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -59,7 +59,7 @@
         self.load = load
         self.seed = seed
         self.policy: Policy = None  # type: ignore
-        self.optimizer: SACOptimizer = None  # type: ignore
+        self.optimizer: SACOptimizer = ""  # type: ignore
         self.hyperparameters: SACSettings = cast(
             SACSettings, trainer_settings.hyperparameters
         )

Mutant 433

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -63,7 +63,7 @@
         self.hyperparameters: SACSettings = cast(
             SACSettings, trainer_settings.hyperparameters
         )
-        self.step = 0
+        self.step = 1
 
         # Don't divide by zero
         self.update_steps = 1

Mutant 434

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -63,7 +63,7 @@
         self.hyperparameters: SACSettings = cast(
             SACSettings, trainer_settings.hyperparameters
         )
-        self.step = 0
+        self.step = None
 
         # Don't divide by zero
         self.update_steps = 1

Mutant 435

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -66,7 +66,7 @@
         self.step = 0
 
         # Don't divide by zero
-        self.update_steps = 1
+        self.update_steps = 2
         self.reward_signal_update_steps = 1
 
         self.steps_per_update = self.hyperparameters.steps_per_update

Mutant 436

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -66,7 +66,7 @@
         self.step = 0
 
         # Don't divide by zero
-        self.update_steps = 1
+        self.update_steps = None
         self.reward_signal_update_steps = 1
 
         self.steps_per_update = self.hyperparameters.steps_per_update

Mutant 437

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -67,7 +67,7 @@
 
         # Don't divide by zero
         self.update_steps = 1
-        self.reward_signal_update_steps = 1
+        self.reward_signal_update_steps = 2
 
         self.steps_per_update = self.hyperparameters.steps_per_update
         self.reward_signal_steps_per_update = (

Mutant 438

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -67,7 +67,7 @@
 
         # Don't divide by zero
         self.update_steps = 1
-        self.reward_signal_update_steps = 1
+        self.reward_signal_update_steps = None
 
         self.steps_per_update = self.hyperparameters.steps_per_update
         self.reward_signal_steps_per_update = (

Mutant 440

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -74,7 +74,7 @@
             self.hyperparameters.reward_signal_steps_per_update
         )
 
-        self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer
+        self.checkpoint_replay_buffer = None
 
     def _checkpoint(self) -> NNCheckpoint:
         """

Mutant 445

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -124,7 +124,7 @@
         """
         super()._process_trajectory(trajectory)
         last_step = trajectory.steps[-1]
-        agent_id = trajectory.agent_id  # All the agents should have the same ID
+        agent_id = None  # All the agents should have the same ID
 
         agent_buffer_trajectory = trajectory.to_agentbuffer()
 

Mutant 449

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -133,7 +133,7 @@
             self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
 
         # Evaluate all reward functions for reporting purposes
-        self.collected_rewards["environment"][agent_id] += np.sum(
+        self.collected_rewards["environment"][agent_id] = np.sum(
             agent_buffer_trajectory["environment_rewards"]
         )
         for name, reward_signal in self.optimizer.reward_signals.items():

Mutant 450

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -133,7 +133,7 @@
             self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
 
         # Evaluate all reward functions for reporting purposes
-        self.collected_rewards["environment"][agent_id] += np.sum(
+        self.collected_rewards["environment"][agent_id] -= np.sum(
             agent_buffer_trajectory["environment_rewards"]
         )
         for name, reward_signal in self.optimizer.reward_signals.items():

Mutant 452

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -141,7 +141,7 @@
                 agent_buffer_trajectory
             ).scaled_reward
             # Report the reward signals
-            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
+            self.collected_rewards[name][agent_id] = np.sum(evaluate_result)
 
         # Get all value estimates for reporting purposes
         value_estimates, _ = self.optimizer.get_trajectory_value_estimates(

Mutant 453

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -141,7 +141,7 @@
                 agent_buffer_trajectory
             ).scaled_reward
             # Report the reward signals
-            self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
+            self.collected_rewards[name][agent_id] -= np.sum(evaluate_result)
 
         # Get all value estimates for reporting purposes
         value_estimates, _ = self.optimizer.get_trajectory_value_estimates(

Mutant 454

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -178,7 +178,7 @@
         :return: A boolean corresponding to whether or not _update_policy() can be run
         """
         return (
-            self.update_buffer.num_experiences >= self.hyperparameters.batch_size
+            self.update_buffer.num_experiences > self.hyperparameters.batch_size
             and self.step >= self.hyperparameters.buffer_init_steps
         )
 

Mutant 455

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -179,7 +179,7 @@
         """
         return (
             self.update_buffer.num_experiences >= self.hyperparameters.batch_size
-            and self.step >= self.hyperparameters.buffer_init_steps
+            and self.step > self.hyperparameters.buffer_init_steps
         )
 
     @timed

Mutant 456

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -178,8 +178,7 @@
         :return: A boolean corresponding to whether or not _update_policy() can be run
         """
         return (
-            self.update_buffer.num_experiences >= self.hyperparameters.batch_size
-            and self.step >= self.hyperparameters.buffer_init_steps
+            self.update_buffer.num_experiences >= self.hyperparameters.batch_size or self.step >= self.hyperparameters.buffer_init_steps
         )
 
     @timed

Mutant 457

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -182,7 +182,6 @@
             and self.step >= self.hyperparameters.buffer_init_steps
         )
 
-    @timed
     def _update_policy(self) -> bool:
         """
         Update the SAC policy and reward signals. The reward signal generators are updated using different mini batches.

Mutant 461

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -205,7 +205,7 @@
             self.load,
             tanh_squash=True,
             reparameterize=True,
-            create_tf_graph=False,
+            create_tf_graph=True,
         )
         # Load the replay buffer if load
         if self.load and self.checkpoint_replay_buffer:

Mutant 462

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -208,7 +208,7 @@
             create_tf_graph=False,
         )
         # Load the replay buffer if load
-        if self.load and self.checkpoint_replay_buffer:
+        if self.load or self.checkpoint_replay_buffer:
             try:
                 self.load_replay_buffer()
             except (AttributeError, FileNotFoundError):

Mutant 463

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -228,7 +228,7 @@
         Uses update_buffer to update the policy. We sample the update_buffer and update
         until the steps_per_update ratio is met.
         """
-        has_updated = False
+        has_updated = True
         self.cumulative_returns_since_policy_update.clear()
         n_sequences = max(
             int(self.hyperparameters.batch_size / self.policy.sequence_length), 1

Mutant 464

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -228,7 +228,7 @@
         Uses update_buffer to update the policy. We sample the update_buffer and update
         until the steps_per_update ratio is met.
         """
-        has_updated = False
+        has_updated = None
         self.cumulative_returns_since_policy_update.clear()
         n_sequences = max(
             int(self.hyperparameters.batch_size / self.policy.sequence_length), 1

Mutant 468

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -236,7 +236,7 @@
 
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
-            self.step - self.hyperparameters.buffer_init_steps
+            self.step + self.hyperparameters.buffer_init_steps
         ) / self.update_steps > self.steps_per_update:
             logger.debug(f"Updating SAC policy at step {self.step}")
             buffer = self.update_buffer

Mutant 470

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -237,7 +237,7 @@
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
             self.step - self.hyperparameters.buffer_init_steps
-        ) / self.update_steps > self.steps_per_update:
+        ) / self.update_steps >= self.steps_per_update:
             logger.debug(f"Updating SAC policy at step {self.step}")
             buffer = self.update_buffer
             if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:

Mutant 477

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -259,7 +259,7 @@
 
                 for stat, stat_list in batch_update_stats.items():
                     self._stats_reporter.add_stat(stat, np.mean(stat_list))
-                has_updated = True
+                has_updated = False
 
             if self.optimizer.bc_module:
                 update_stats = self.optimizer.bc_module.update()

Mutant 478

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -259,7 +259,7 @@
 
                 for stat, stat_list in batch_update_stats.items():
                     self._stats_reporter.add_stat(stat, np.mean(stat_list))
-                has_updated = True
+                has_updated = None
 
             if self.optimizer.bc_module:
                 update_stats = self.optimizer.bc_module.update()

Mutant 480

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -268,7 +268,7 @@
 
         # Truncate update buffer if neccessary. Truncate more than we need to to avoid truncating
         # a large buffer at each update.
-        if self.update_buffer.num_experiences > self.hyperparameters.buffer_size:
+        if self.update_buffer.num_experiences >= self.hyperparameters.buffer_size:
             self.update_buffer.truncate(
                 int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
             )

Mutant 481

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -270,7 +270,7 @@
         # a large buffer at each update.
         if self.update_buffer.num_experiences > self.hyperparameters.buffer_size:
             self.update_buffer.truncate(
-                int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
+                int(self.hyperparameters.buffer_size / BUFFER_TRUNCATE_PERCENT)
             )
         return has_updated
 

Mutant 483

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -286,7 +286,7 @@
         """
         buffer = self.update_buffer
         n_sequences = max(
-            int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
+            int(self.hyperparameters.batch_size * self.policy.sequence_length), 1
         )
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (

Mutant 484

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -286,7 +286,7 @@
         """
         buffer = self.update_buffer
         n_sequences = max(
-            int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
+            int(self.hyperparameters.batch_size / self.policy.sequence_length), 2
         )
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (

Mutant 486

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -290,7 +290,7 @@
         )
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
-            self.step - self.hyperparameters.buffer_init_steps
+            self.step + self.hyperparameters.buffer_init_steps
         ) / self.reward_signal_update_steps > self.reward_signal_steps_per_update:
             # Get minibatches for reward signal update if needed
             reward_signal_minibatches = {}

Mutant 488

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -291,7 +291,7 @@
         batch_update_stats: Dict[str, list] = defaultdict(list)
         while (
             self.step - self.hyperparameters.buffer_init_steps
-        ) / self.reward_signal_update_steps > self.reward_signal_steps_per_update:
+        ) / self.reward_signal_update_steps >= self.reward_signal_steps_per_update:
             # Get minibatches for reward signal update if needed
             reward_signal_minibatches = {}
             for name, signal in self.optimizer.reward_signals.items():

Mutant 491

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -307,7 +307,7 @@
             )
             for stat_name, value in update_stats.items():
                 batch_update_stats[stat_name].append(value)
-            self.reward_signal_update_steps += 1
+            self.reward_signal_update_steps -= 1
 
             for stat, stat_list in batch_update_stats.items():
                 self._stats_reporter.add_stat(stat, np.mean(stat_list))

Mutant 492

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -307,7 +307,7 @@
             )
             for stat_name, value in update_stats.items():
                 batch_update_stats[stat_name].append(value)
-            self.reward_signal_update_steps += 1
+            self.reward_signal_update_steps += 2
 
             for stat, stat_list in batch_update_stats.items():
                 self._stats_reporter.add_stat(stat, np.mean(stat_list))

Mutant 496

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -332,7 +332,7 @@
         self.policies[parsed_behavior_id.behavior_id] = policy
         self.optimizer = self.create_sac_optimizer()
         for _reward_signal in self.optimizer.reward_signals.keys():
-            self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
+            self.collected_rewards[_reward_signal] = defaultdict(lambda: 1)
         # Needed to resume loads properly
         self.step = policy.get_current_step()
         # Assume steps were updated at the correct ratio before

Mutant 501

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -336,7 +336,7 @@
         # Needed to resume loads properly
         self.step = policy.get_current_step()
         # Assume steps were updated at the correct ratio before
-        self.update_steps = int(max(1, self.step / self.steps_per_update))
+        self.update_steps = int(max(1, self.step * self.steps_per_update))
         self.reward_signal_update_steps = int(
             max(1, self.step / self.reward_signal_steps_per_update)
         )

Mutant 503

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -338,7 +338,7 @@
         # Assume steps were updated at the correct ratio before
         self.update_steps = int(max(1, self.step / self.steps_per_update))
         self.reward_signal_update_steps = int(
-            max(1, self.step / self.reward_signal_steps_per_update)
+            max(2, self.step / self.reward_signal_steps_per_update)
         )
 
     def get_policy(self, name_behavior_id: str) -> Policy:

Mutant 504

--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -338,7 +338,7 @@
         # Assume steps were updated at the correct ratio before
         self.update_steps = int(max(1, self.step / self.steps_per_update))
         self.reward_signal_update_steps = int(
-            max(1, self.step / self.reward_signal_steps_per_update)
+            max(1, self.step * self.reward_signal_steps_per_update)
         )
 
     def get_policy(self, name_behavior_id: str) -> Policy: