ml-agents/mlagents/trainers/sac/optimizer.py

Killed 170 out of 253 mutants

Survived

The mutants listed below survived mutation testing: the test suite still passed with each change applied, so these mutants show holes in your test suite.

Mutant 252

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -12,7 +12,7 @@
 from mlagents_envs.timers import timed
 from mlagents.trainers.settings import TrainerSettings, SACSettings
 
-EPSILON = 1e-6  # Small value to avoid divide by zero
+EPSILON = 1.000001  # Small value to avoid divide by zero
 
 logger = get_logger(__name__)
 

Mutant 253

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -12,7 +12,7 @@
 from mlagents_envs.timers import timed
 from mlagents.trainers.settings import TrainerSettings, SACSettings
 
-EPSILON = 1e-6  # Small value to avoid divide by zero
+EPSILON = None  # Small value to avoid divide by zero
 
 logger = get_logger(__name__)
 

Mutant 255

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -16,7 +16,7 @@
 
 logger = get_logger(__name__)
 
-POLICY_SCOPE = ""
+POLICY_SCOPE = "XXXX"
 TARGET_SCOPE = "target_network"
 
 

Mutant 256

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -16,7 +16,7 @@
 
 logger = get_logger(__name__)
 
-POLICY_SCOPE = ""
+POLICY_SCOPE = None
 TARGET_SCOPE = "target_network"
 
 

Mutant 257

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -17,7 +17,7 @@
 logger = get_logger(__name__)
 
 POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
+TARGET_SCOPE = "XXtarget_networkXX"
 
 
 class SACOptimizer(TFOptimizer):

Mutant 258

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -17,7 +17,7 @@
 logger = get_logger(__name__)
 
 POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
+TARGET_SCOPE = None
 
 
 class SACOptimizer(TFOptimizer):

Mutant 263

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -51,7 +51,7 @@
                 lr = hyperparameters.learning_rate
                 lr_schedule = hyperparameters.learning_rate_schedule
                 max_step = trainer_params.max_steps
-                self.tau = hyperparameters.tau
+                self.tau = None
                 self.init_entcoef = hyperparameters.init_entcoef
 
                 self.policy = policy

Mutant 275

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -68,7 +68,7 @@
                 self.discrete_target_entropy_scale = (
                     0.2  # Roughly equal to e-greedy 0.05
                 )
-                self.continuous_target_entropy_scale = 1.0
+                self.continuous_target_entropy_scale = 2.0
 
                 stream_names = list(self.reward_signals.keys())
                 # Use to reduce "survivor bonus" when using Curiosity or GAIL.

Mutant 278

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -76,7 +76,7 @@
                     _val.gamma for _val in trainer_params.reward_signals.values()
                 ]
                 self.use_dones_in_backup = {
-                    name: tf.Variable(1.0) for name in stream_names
+                    name: tf.Variable(2.0) for name in stream_names
                 }
                 self.disable_use_dones = {
                     name: self.use_dones_in_backup[name].assign(0.0)

Mutant 279

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -79,7 +79,7 @@
                     name: tf.Variable(1.0) for name in stream_names
                 }
                 self.disable_use_dones = {
-                    name: self.use_dones_in_backup[name].assign(0.0)
+                    name: self.use_dones_in_backup[name].assign(1.0)
                     for name in stream_names
                 }
 

Mutant 280

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -83,7 +83,7 @@
                     for name in stream_names
                 }
 
-                if num_layers < 1:
+                if num_layers <= 1:
                     num_layers = 1
 
                 self.target_init_op: List[tf.Tensor] = []

Mutant 281

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -83,7 +83,7 @@
                     for name in stream_names
                 }
 
-                if num_layers < 1:
+                if num_layers < 2:
                     num_layers = 1
 
                 self.target_init_op: List[tf.Tensor] = []

Mutant 282

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -86,7 +86,7 @@
                 if num_layers < 1:
                     num_layers = 1
 
-                self.target_init_op: List[tf.Tensor] = []
+                self.target_init_op: List[tf.Tensor] = None
                 self.target_update_op: List[tf.Tensor] = []
                 self.update_batch_policy: Optional[tf.Operation] = None
                 self.update_batch_value: Optional[tf.Operation] = None

Mutant 283

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -87,7 +87,7 @@
                     num_layers = 1
 
                 self.target_init_op: List[tf.Tensor] = []
-                self.target_update_op: List[tf.Tensor] = []
+                self.target_update_op: List[tf.Tensor] = None
                 self.update_batch_policy: Optional[tf.Operation] = None
                 self.update_batch_value: Optional[tf.Operation] = None
                 self.update_batch_entropy: Optional[tf.Operation] = None

Mutant 284

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -88,7 +88,7 @@
 
                 self.target_init_op: List[tf.Tensor] = []
                 self.target_update_op: List[tf.Tensor] = []
-                self.update_batch_policy: Optional[tf.Operation] = None
+                self.update_batch_policy: Optional[tf.Operation] = ""
                 self.update_batch_value: Optional[tf.Operation] = None
                 self.update_batch_entropy: Optional[tf.Operation] = None
 

Mutant 285

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -89,7 +89,7 @@
                 self.target_init_op: List[tf.Tensor] = []
                 self.target_update_op: List[tf.Tensor] = []
                 self.update_batch_policy: Optional[tf.Operation] = None
-                self.update_batch_value: Optional[tf.Operation] = None
+                self.update_batch_value: Optional[tf.Operation] = ""
                 self.update_batch_entropy: Optional[tf.Operation] = None
 
                 self.policy_network = SACPolicyNetwork(

Mutant 286

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -90,7 +90,7 @@
                 self.target_update_op: List[tf.Tensor] = []
                 self.update_batch_policy: Optional[tf.Operation] = None
                 self.update_batch_value: Optional[tf.Operation] = None
-                self.update_batch_entropy: Optional[tf.Operation] = None
+                self.update_batch_entropy: Optional[tf.Operation] = ""
 
                 self.policy_network = SACPolicyNetwork(
                     policy=self.policy,

Mutant 290

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -120,7 +120,7 @@
                     lr,
                     self.policy.global_step,
                     int(max_step),
-                    min_value=1e-10,
+                    min_value=1.0000000001,
                 )
                 self._create_losses(
                     self.policy_network.q1_heads,

Mutant 292

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -149,7 +149,7 @@
                 self.policy.initialize_or_load()
 
         self.stats_name_to_update_name = {
-            "Losses/Value Loss": "value_loss",
+            "XXLosses/Value LossXX": "value_loss",
             "Losses/Policy Loss": "policy_loss",
             "Losses/Q1 Loss": "q1_loss",
             "Losses/Q2 Loss": "q2_loss",

Mutant 294

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -150,7 +150,7 @@
 
         self.stats_name_to_update_name = {
             "Losses/Value Loss": "value_loss",
-            "Losses/Policy Loss": "policy_loss",
+            "XXLosses/Policy LossXX": "policy_loss",
             "Losses/Q1 Loss": "q1_loss",
             "Losses/Q2 Loss": "q2_loss",
             "Policy/Entropy Coeff": "entropy_coef",

Mutant 296

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -151,7 +151,7 @@
         self.stats_name_to_update_name = {
             "Losses/Value Loss": "value_loss",
             "Losses/Policy Loss": "policy_loss",
-            "Losses/Q1 Loss": "q1_loss",
+            "XXLosses/Q1 LossXX": "q1_loss",
             "Losses/Q2 Loss": "q2_loss",
             "Policy/Entropy Coeff": "entropy_coef",
             "Policy/Learning Rate": "learning_rate",

Mutant 298

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -152,7 +152,7 @@
             "Losses/Value Loss": "value_loss",
             "Losses/Policy Loss": "policy_loss",
             "Losses/Q1 Loss": "q1_loss",
-            "Losses/Q2 Loss": "q2_loss",
+            "XXLosses/Q2 LossXX": "q2_loss",
             "Policy/Entropy Coeff": "entropy_coef",
             "Policy/Learning Rate": "learning_rate",
         }

Mutant 300

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -153,7 +153,7 @@
             "Losses/Policy Loss": "policy_loss",
             "Losses/Q1 Loss": "q1_loss",
             "Losses/Q2 Loss": "q2_loss",
-            "Policy/Entropy Coeff": "entropy_coef",
+            "XXPolicy/Entropy CoeffXX": "entropy_coef",
             "Policy/Learning Rate": "learning_rate",
         }
 

Mutant 302

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -154,7 +154,7 @@
             "Losses/Q1 Loss": "q1_loss",
             "Losses/Q2 Loss": "q2_loss",
             "Policy/Entropy Coeff": "entropy_coef",
-            "Policy/Learning Rate": "learning_rate",
+            "XXPolicy/Learning RateXX": "learning_rate",
         }
 
         self.update_dict = {

Mutant 309

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -163,7 +163,7 @@
             "q1_loss": self.q1_loss,
             "q2_loss": self.q2_loss,
             "entropy_coef": self.ent_coef,
-            "update_batch": self.update_batch_policy,
+            "XXupdate_batchXX": self.update_batch_policy,
             "update_value": self.update_batch_value,
             "update_entropy": self.update_batch_entropy,
             "learning_rate": self.learning_rate,

Mutant 310

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -164,7 +164,7 @@
             "q2_loss": self.q2_loss,
             "entropy_coef": self.ent_coef,
             "update_batch": self.update_batch_policy,
-            "update_value": self.update_batch_value,
+            "XXupdate_valueXX": self.update_batch_value,
             "update_entropy": self.update_batch_entropy,
             "learning_rate": self.learning_rate,
         }

Mutant 311

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -165,7 +165,7 @@
             "entropy_coef": self.ent_coef,
             "update_batch": self.update_batch_policy,
             "update_value": self.update_batch_value,
-            "update_entropy": self.update_batch_entropy,
+            "XXupdate_entropyXX": self.update_batch_entropy,
             "learning_rate": self.learning_rate,
         }
 

Mutant 313

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -174,7 +174,7 @@
         Assign the higher-level SACModel's inputs and outputs to those of its policy or
         target network.
         """
-        self.vector_in = self.policy.vector_in
+        self.vector_in = None
         self.visual_in = self.policy.visual_in
         self.next_vector_in = self.target_network.vector_in
         self.next_visual_in = self.target_network.visual_in

Mutant 314

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -175,7 +175,7 @@
         target network.
         """
         self.vector_in = self.policy.vector_in
-        self.visual_in = self.policy.visual_in
+        self.visual_in = None
         self.next_vector_in = self.target_network.vector_in
         self.next_visual_in = self.target_network.visual_in
         self.sequence_length_ph = self.policy.sequence_length_ph

Mutant 317

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -178,7 +178,7 @@
         self.visual_in = self.policy.visual_in
         self.next_vector_in = self.target_network.vector_in
         self.next_visual_in = self.target_network.visual_in
-        self.sequence_length_ph = self.policy.sequence_length_ph
+        self.sequence_length_ph = None
         self.next_sequence_length_ph = self.target_network.sequence_length_ph
         if not self.policy.use_continuous_act:
             self.action_masks = self.policy_network.action_masks

Mutant 319

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -180,7 +180,7 @@
         self.next_visual_in = self.target_network.visual_in
         self.sequence_length_ph = self.policy.sequence_length_ph
         self.next_sequence_length_ph = self.target_network.sequence_length_ph
-        if not self.policy.use_continuous_act:
+        if  self.policy.use_continuous_act:
             self.action_masks = self.policy_network.action_masks
         else:
             self.output_pre = self.policy_network.output_pre

Mutant 320

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -181,7 +181,7 @@
         self.sequence_length_ph = self.policy.sequence_length_ph
         self.next_sequence_length_ph = self.target_network.sequence_length_ph
         if not self.policy.use_continuous_act:
-            self.action_masks = self.policy_network.action_masks
+            self.action_masks = None
         else:
             self.output_pre = self.policy_network.output_pre
 

Mutant 321

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -183,7 +183,7 @@
         if not self.policy.use_continuous_act:
             self.action_masks = self.policy_network.action_masks
         else:
-            self.output_pre = self.policy_network.output_pre
+            self.output_pre = None
 
         # Don't use value estimate during inference.
         self.value = tf.identity(

Mutant 322

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -187,7 +187,7 @@
 
         # Don't use value estimate during inference.
         self.value = tf.identity(
-            self.policy_network.value, name="value_estimate_unused"
+            self.policy_network.value, name="XXvalue_estimate_unusedXX"
         )
         self.value_heads = self.policy_network.value_heads
         self.dones_holder = tf.placeholder(

Mutant 324

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -191,7 +191,7 @@
         )
         self.value_heads = self.policy_network.value_heads
         self.dones_holder = tf.placeholder(
-            shape=[None], dtype=tf.float32, name="dones_holder"
+            shape=[None], dtype=tf.float32, name="XXdones_holderXX"
         )
 
         if self.policy.use_recurrent:

Mutant 327

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -197,7 +197,7 @@
         if self.policy.use_recurrent:
             self.memory_in = self.policy_network.memory_in
             self.memory_out = self.policy_network.memory_out
-            if not self.policy.use_continuous_act:
+            if  self.policy.use_continuous_act:
                 self.prev_action = self.policy_network.prev_action
             self.next_memory_in = self.target_network.memory_in
 

Mutant 328

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -198,7 +198,7 @@
             self.memory_in = self.policy_network.memory_in
             self.memory_out = self.policy_network.memory_out
             if not self.policy.use_continuous_act:
-                self.prev_action = self.policy_network.prev_action
+                self.prev_action = None
             self.next_memory_in = self.target_network.memory_in
 
     def _create_losses(

Mutant 329

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -199,7 +199,7 @@
             self.memory_out = self.policy_network.memory_out
             if not self.policy.use_continuous_act:
                 self.prev_action = self.policy_network.prev_action
-            self.next_memory_in = self.target_network.memory_in
+            self.next_memory_in = None
 
     def _create_losses(
         self,

Mutant 330

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -208,7 +208,7 @@
         lr: tf.Tensor,
         max_step: int,
         stream_names: List[str],
-        discrete: bool = False,
+        discrete: bool = True,
     ) -> None:
         """
         Creates training-specific Tensorflow ops for SAC models.

Mutant 331

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -222,7 +222,7 @@
 
         if discrete:
             self.target_entropy = [
-                self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
+                self.discrete_target_entropy_scale / np.log(i).astype(np.float32)
                 for i in self.act_size
             ]
             discrete_action_probs = tf.exp(self.policy.all_log_probs)

Mutant 336

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -229,7 +229,7 @@
             per_action_entropy = discrete_action_probs * self.policy.all_log_probs
         else:
             self.target_entropy = (
-                -1
+                -2
                 * self.continuous_target_entropy_scale
                 * np.prod(self.act_size[0]).astype(np.float32)
             )

Mutant 337

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -230,7 +230,7 @@
         else:
             self.target_entropy = (
                 -1
-                * self.continuous_target_entropy_scale
+                / self.continuous_target_entropy_scale
                 * np.prod(self.act_size[0]).astype(np.float32)
             )
 

Mutant 338

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -231,7 +231,7 @@
             self.target_entropy = (
                 -1
                 * self.continuous_target_entropy_scale
-                * np.prod(self.act_size[0]).astype(np.float32)
+                / np.prod(self.act_size[0]).astype(np.float32)
             )
 
         self.rewards_holders = {}

Mutant 356

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -278,7 +278,7 @@
         q1_losses = []
         q2_losses = []
         # Multiple q losses per stream
-        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
+        expanded_dones = tf.expand_dims(self.dones_holder, axis=+1)
         for i, name in enumerate(stream_names):
             _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
 

Mutant 357

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -278,7 +278,7 @@
         q1_losses = []
         q2_losses = []
         # Multiple q losses per stream
-        expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
+        expanded_dones = tf.expand_dims(self.dones_holder, axis=-2)
         for i, name in enumerate(stream_names):
             _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
 

Mutant 359

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -280,7 +280,7 @@
         # Multiple q losses per stream
         expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
         for i, name in enumerate(stream_names):
-            _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
+            _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=+1)
 
             q_backup = tf.stop_gradient(
                 _expanded_rewards

Mutant 362

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -284,7 +284,7 @@
 
             q_backup = tf.stop_gradient(
                 _expanded_rewards
-                + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
+                - (1.0 - self.use_dones_in_backup[name] * expanded_dones)
                 * self.gammas[i]
                 * self.target_network.value_heads[name]
             )

Mutant 366

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -285,7 +285,7 @@
             q_backup = tf.stop_gradient(
                 _expanded_rewards
                 + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
-                * self.gammas[i]
+                / self.gammas[i]
                 * self.target_network.value_heads[name]
             )
 

Mutant 371

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -300,7 +300,7 @@
 
                 # Reduce each branch into scalar
                 branched_q1_stream = [
-                    tf.reduce_sum(_branch, axis=1, keep_dims=True)
+                    tf.reduce_sum(_branch, axis=1, keep_dims=False)
                     for _branch in branched_q1_stream
                 ]
                 branched_q2_stream = [

Mutant 380

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -315,7 +315,7 @@
                 q1_stream = q1_streams[name]
                 q2_stream = q2_streams[name]
 
-            _q1_loss = 0.5 * tf.reduce_mean(
+            _q1_loss = 1.5 * tf.reduce_mean(
                 tf.to_float(self.policy.mask)
                 * tf.squared_difference(q_backup, q1_stream)
             )

Mutant 383

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -320,7 +320,7 @@
                 * tf.squared_difference(q_backup, q1_stream)
             )
 
-            _q2_loss = 0.5 * tf.reduce_mean(
+            _q2_loss = 1.5 * tf.reduce_mean(
                 tf.to_float(self.policy.mask)
                 * tf.squared_difference(q_backup, q2_stream)
             )

Mutant 388

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -335,7 +335,7 @@
         if discrete:
             # Create a log_ent_coef for each branch
             self.log_ent_coef = tf.get_variable(
-                "log_ent_coef",
+                "XXlog_ent_coefXX",
                 dtype=tf.float32,
                 initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
                     np.float32

Mutant 390

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -340,7 +340,7 @@
                 initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
                     np.float32
                 ),
-                trainable=True,
+                trainable=False,
             )
         else:
             self.log_ent_coef = tf.get_variable(

Mutant 391

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -344,7 +344,7 @@
             )
         else:
             self.log_ent_coef = tf.get_variable(
-                "log_ent_coef",
+                "XXlog_ent_coefXX",
                 dtype=tf.float32,
                 initializer=np.log(self.init_entcoef).astype(np.float32),
                 trainable=True,

Mutant 392

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -347,7 +347,7 @@
                 "log_ent_coef",
                 dtype=tf.float32,
                 initializer=np.log(self.init_entcoef).astype(np.float32),
-                trainable=True,
+                trainable=False,
             )
 
         self.ent_coef = tf.exp(self.log_ent_coef)

Mutant 396

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -358,7 +358,7 @@
             )
             branched_ent_sums = tf.stack(
                 [
-                    tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
+                    tf.reduce_sum(_lp, axis=1, keep_dims=True) - _te
                     for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
                 ],
                 axis=1,

Mutant 400

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -367,7 +367,7 @@
                 tf.to_float(self.policy.mask)
                 * tf.reduce_mean(
                     self.log_ent_coef
-                    * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
+                    / tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
                     axis=1,
                 )
             )

Mutant 407

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -381,7 +381,7 @@
 
             branched_policy_loss = tf.stack(
                 [
-                    tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
+                    tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=False)
                     for i, (_lp, _qt) in enumerate(
                         zip(branched_per_action_ent, branched_q_term)
                     )

Mutant 411

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -394,7 +394,7 @@
             # Do vbackup entropy bonus per branch as well.
             branched_ent_bonus = tf.stack(
                 [
-                    tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
+                    tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=False)
                     for i, _lp in enumerate(branched_per_action_ent)
                 ]
             )

Mutant 413

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -402,7 +402,7 @@
             for name in stream_names:
                 v_backup = tf.stop_gradient(
                     self.min_policy_qs[name]
-                    - tf.reduce_mean(branched_ent_bonus, axis=0)
+                    + tf.reduce_mean(branched_ent_bonus, axis=0)
                 )
                 value_losses.append(
                     0.5

Mutant 415

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -405,7 +405,7 @@
                     - tf.reduce_mean(branched_ent_bonus, axis=0)
                 )
                 value_losses.append(
-                    0.5
+                    1.5
                     * tf.reduce_mean(
                         tf.to_float(self.policy.mask)
                         * tf.squared_difference(

Mutant 420

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -418,7 +418,7 @@
             self.entropy_loss = -tf.reduce_mean(
                 self.log_ent_coef
                 * tf.to_float(self.policy.mask)
-                * tf.stop_gradient(
+                / tf.stop_gradient(
                     tf.reduce_sum(
                         self.policy.all_log_probs + self.target_entropy,
                         axis=1,

Mutant 423

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -422,7 +422,7 @@
                     tf.reduce_sum(
                         self.policy.all_log_probs + self.target_entropy,
                         axis=1,
-                        keep_dims=True,
+                        keep_dims=False,
                     )
                 )
             )

Mutant 429

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -438,7 +438,7 @@
             for name in stream_names:
                 v_backup = tf.stop_gradient(
                     self.min_policy_qs[name]
-                    - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+                    + tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
                 )
                 value_losses.append(
                     0.5

Mutant 430

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -438,7 +438,7 @@
             for name in stream_names:
                 v_backup = tf.stop_gradient(
                     self.min_policy_qs[name]
-                    - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+                    - tf.reduce_sum(self.ent_coef / self.policy.all_log_probs, axis=1)
                 )
                 value_losses.append(
                     0.5

Mutant 432

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -441,7 +441,7 @@
                     - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
                 )
                 value_losses.append(
-                    0.5
+                    1.5
                     * tf.reduce_mean(
                         tf.to_float(self.policy.mask)
                         * tf.squared_difference(

Mutant 439

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -453,7 +453,7 @@
 
         self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
 
-        self.entropy = self.policy_network.entropy
+        self.entropy = None
 
     def _create_sac_optimizer_ops(self) -> None:
         """

Mutant 440

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -461,7 +461,7 @@
         the policy, value, and entropy updates, as well as the target network update.
         """
         policy_optimizer = self.create_optimizer_op(
-            learning_rate=self.learning_rate, name="sac_policy_opt"
+            learning_rate=self.learning_rate, name="XXsac_policy_optXX"
         )
         entropy_optimizer = self.create_optimizer_op(
             learning_rate=self.learning_rate, name="sac_entropy_opt"

Mutant 441

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -464,7 +464,7 @@
             learning_rate=self.learning_rate, name="sac_policy_opt"
         )
         entropy_optimizer = self.create_optimizer_op(
-            learning_rate=self.learning_rate, name="sac_entropy_opt"
+            learning_rate=self.learning_rate, name="XXsac_entropy_optXX"
         )
         value_optimizer = self.create_optimizer_op(
             learning_rate=self.learning_rate, name="sac_value_opt"

Mutant 442

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -467,7 +467,7 @@
             learning_rate=self.learning_rate, name="sac_entropy_opt"
         )
         value_optimizer = self.create_optimizer_op(
-            learning_rate=self.learning_rate, name="sac_value_opt"
+            learning_rate=self.learning_rate, name="XXsac_value_optXX"
         )
 
         self.target_update_op = [

Mutant 448

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -476,7 +476,7 @@
                 self.target_network.value_vars, self.policy_network.value_vars
             )
         ]
-        logger.debug("value_vars")
+        logger.debug("XXvalue_varsXX")
         self.print_all_vars(self.policy_network.value_vars)
         logger.debug("targvalue_vars")
         self.print_all_vars(self.target_network.value_vars)

Mutant 449

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -478,7 +478,7 @@
         ]
         logger.debug("value_vars")
         self.print_all_vars(self.policy_network.value_vars)
-        logger.debug("targvalue_vars")
+        logger.debug("XXtargvalue_varsXX")
         self.print_all_vars(self.target_network.value_vars)
         logger.debug("critic_vars")
         self.print_all_vars(self.policy_network.critic_vars)

Mutant 450

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -480,7 +480,7 @@
         self.print_all_vars(self.policy_network.value_vars)
         logger.debug("targvalue_vars")
         self.print_all_vars(self.target_network.value_vars)
-        logger.debug("critic_vars")
+        logger.debug("XXcritic_varsXX")
         self.print_all_vars(self.policy_network.critic_vars)
         logger.debug("q_vars")
         self.print_all_vars(self.policy_network.q_vars)

Mutant 451

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -482,7 +482,7 @@
         self.print_all_vars(self.target_network.value_vars)
         logger.debug("critic_vars")
         self.print_all_vars(self.policy_network.critic_vars)
-        logger.debug("q_vars")
+        logger.debug("XXq_varsXX")
         self.print_all_vars(self.policy_network.q_vars)
         logger.debug("policy_vars")
         policy_vars = self.policy.get_trainable_variables()

Mutant 452

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -484,7 +484,7 @@
         self.print_all_vars(self.policy_network.critic_vars)
         logger.debug("q_vars")
         self.print_all_vars(self.policy_network.q_vars)
-        logger.debug("policy_vars")
+        logger.debug("XXpolicy_varsXX")
         policy_vars = self.policy.get_trainable_variables()
         self.print_all_vars(policy_vars)
 

Mutant 454

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -514,7 +514,6 @@
         for _var in variables:
             logger.debug(_var)
 
-    @timed
     def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
         """
         Updates model using buffer.

Mutant 466

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -596,7 +596,7 @@
         :param num_sequences: Number of LSTM sequences in batch.
         """
         # Do an optional burn-in for memories
-        num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
+        num_burn_in = int(self.burn_in_ratio / self.policy.sequence_length)
         burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
         burn_in_mask[range(0, num_burn_in)] = 0
         burn_in_mask = np.tile(burn_in_mask, num_sequences)

Mutant 469

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -598,7 +598,7 @@
         # Do an optional burn-in for memories
         num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
         burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
-        burn_in_mask[range(0, num_burn_in)] = 0
+        burn_in_mask[range(1, num_burn_in)] = 0
         burn_in_mask = np.tile(burn_in_mask, num_sequences)
         feed_dict = {
             policy.batch_size_ph: num_sequences,

Mutant 470

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -598,7 +598,7 @@
         # Do an optional burn-in for memories
         num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
         burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
-        burn_in_mask[range(0, num_burn_in)] = 0
+        burn_in_mask[range(0, num_burn_in)] = 1
         burn_in_mask = np.tile(burn_in_mask, num_sequences)
         feed_dict = {
             policy.batch_size_ph: num_sequences,

Mutant 471

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -598,7 +598,7 @@
         # Do an optional burn-in for memories
         num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
         burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
-        burn_in_mask[range(0, num_burn_in)] = 0
+        burn_in_mask[range(0, num_burn_in)] = None
         burn_in_mask = np.tile(burn_in_mask, num_sequences)
         feed_dict = {
             policy.batch_size_ph: num_sequences,

Mutant 474

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -604,7 +604,7 @@
             policy.batch_size_ph: num_sequences,
             policy.sequence_length_ph: self.policy.sequence_length,
             self.next_sequence_length_ph: self.policy.sequence_length,
-            self.policy.mask_input: batch["masks"] * burn_in_mask,
+            self.policy.mask_input: batch["masks"] / burn_in_mask,
         }
         for name in self.reward_signals:
             feed_dict[self.rewards_holders[name]] = batch[f"{name}_rewards"]

Mutant 488

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -619,7 +619,7 @@
         if self.policy.use_vec_obs:
             feed_dict[policy.vector_in] = batch["vector_obs"]
             feed_dict[self.next_vector_in] = batch["next_vector_in"]
-        if self.policy.vis_obs_size > 0:
+        if self.policy.vis_obs_size >= 0:
             for i, _ in enumerate(policy.visual_in):
                 _obs = batch["visual_obs%d" % i]
                 feed_dict[policy.visual_in[i]] = _obs

Mutant 499

--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -629,7 +629,7 @@
         if self.policy.use_recurrent:
             feed_dict[policy.memory_in] = [
                 batch["memory"][i]
-                for i in range(0, len(batch["memory"]), self.policy.sequence_length)
+                for i in range(1, len(batch["memory"]), self.policy.sequence_length)
             ]
             feed_dict[self.policy_network.memory_in] = self._make_zero_mem(
                 self.m_size, batch.num_experiences