ml-agents/mlagents/trainers/sac/optimizer.py
Killed 171 out of 253 mutants
Survived
Survived mutation testing. These mutants show holes in your test suite.
Mutant 252
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -12,7 +12,7 @@
from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings, SACSettings
-EPSILON = 1e-6 # Small value to avoid divide by zero
+EPSILON = 1.000001 # Small value to avoid divide by zero
logger = get_logger(__name__)
Mutant 253
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -12,7 +12,7 @@
from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings, SACSettings
-EPSILON = 1e-6 # Small value to avoid divide by zero
+EPSILON = None # Small value to avoid divide by zero
logger = get_logger(__name__)
Mutant 255
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -16,7 +16,7 @@
logger = get_logger(__name__)
-POLICY_SCOPE = ""
+POLICY_SCOPE = "XXXX"
TARGET_SCOPE = "target_network"
Mutant 256
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -16,7 +16,7 @@
logger = get_logger(__name__)
-POLICY_SCOPE = ""
+POLICY_SCOPE = None
TARGET_SCOPE = "target_network"
Mutant 257
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -17,7 +17,7 @@
logger = get_logger(__name__)
POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
+TARGET_SCOPE = "XXtarget_networkXX"
class SACOptimizer(TFOptimizer):
Mutant 258
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -17,7 +17,7 @@
logger = get_logger(__name__)
POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
+TARGET_SCOPE = None
class SACOptimizer(TFOptimizer):
Mutant 263
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -51,7 +51,7 @@
lr = hyperparameters.learning_rate
lr_schedule = hyperparameters.learning_rate_schedule
max_step = trainer_params.max_steps
- self.tau = hyperparameters.tau
+ self.tau = None
self.init_entcoef = hyperparameters.init_entcoef
self.policy = policy
Mutant 278
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -76,7 +76,7 @@
_val.gamma for _val in trainer_params.reward_signals.values()
]
self.use_dones_in_backup = {
- name: tf.Variable(1.0) for name in stream_names
+ name: tf.Variable(2.0) for name in stream_names
}
self.disable_use_dones = {
name: self.use_dones_in_backup[name].assign(0.0)
Mutant 279
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -79,7 +79,7 @@
name: tf.Variable(1.0) for name in stream_names
}
self.disable_use_dones = {
- name: self.use_dones_in_backup[name].assign(0.0)
+ name: self.use_dones_in_backup[name].assign(1.0)
for name in stream_names
}
Mutant 280
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -83,7 +83,7 @@
for name in stream_names
}
- if num_layers < 1:
+ if num_layers <= 1:
num_layers = 1
self.target_init_op: List[tf.Tensor] = []
Mutant 281
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -83,7 +83,7 @@
for name in stream_names
}
- if num_layers < 1:
+ if num_layers < 2:
num_layers = 1
self.target_init_op: List[tf.Tensor] = []
Mutant 282
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -86,7 +86,7 @@
if num_layers < 1:
num_layers = 1
- self.target_init_op: List[tf.Tensor] = []
+ self.target_init_op: List[tf.Tensor] = None
self.target_update_op: List[tf.Tensor] = []
self.update_batch_policy: Optional[tf.Operation] = None
self.update_batch_value: Optional[tf.Operation] = None
Mutant 283
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -87,7 +87,7 @@
num_layers = 1
self.target_init_op: List[tf.Tensor] = []
- self.target_update_op: List[tf.Tensor] = []
+ self.target_update_op: List[tf.Tensor] = None
self.update_batch_policy: Optional[tf.Operation] = None
self.update_batch_value: Optional[tf.Operation] = None
self.update_batch_entropy: Optional[tf.Operation] = None
Mutant 284
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -88,7 +88,7 @@
self.target_init_op: List[tf.Tensor] = []
self.target_update_op: List[tf.Tensor] = []
- self.update_batch_policy: Optional[tf.Operation] = None
+ self.update_batch_policy: Optional[tf.Operation] = ""
self.update_batch_value: Optional[tf.Operation] = None
self.update_batch_entropy: Optional[tf.Operation] = None
Mutant 285
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -89,7 +89,7 @@
self.target_init_op: List[tf.Tensor] = []
self.target_update_op: List[tf.Tensor] = []
self.update_batch_policy: Optional[tf.Operation] = None
- self.update_batch_value: Optional[tf.Operation] = None
+ self.update_batch_value: Optional[tf.Operation] = ""
self.update_batch_entropy: Optional[tf.Operation] = None
self.policy_network = SACPolicyNetwork(
Mutant 286
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -90,7 +90,7 @@
self.target_update_op: List[tf.Tensor] = []
self.update_batch_policy: Optional[tf.Operation] = None
self.update_batch_value: Optional[tf.Operation] = None
- self.update_batch_entropy: Optional[tf.Operation] = None
+ self.update_batch_entropy: Optional[tf.Operation] = ""
self.policy_network = SACPolicyNetwork(
policy=self.policy,
Mutant 290
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -120,7 +120,7 @@
lr,
self.policy.global_step,
int(max_step),
- min_value=1e-10,
+ min_value=1.0000000001,
)
self._create_losses(
self.policy_network.q1_heads,
Mutant 292
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -149,7 +149,7 @@
self.policy.initialize_or_load()
self.stats_name_to_update_name = {
- "Losses/Value Loss": "value_loss",
+ "XXLosses/Value LossXX": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Losses/Q1 Loss": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
Mutant 294
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -150,7 +150,7 @@
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
- "Losses/Policy Loss": "policy_loss",
+ "XXLosses/Policy LossXX": "policy_loss",
"Losses/Q1 Loss": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
"Policy/Entropy Coeff": "entropy_coef",
Mutant 296
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -151,7 +151,7 @@
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
- "Losses/Q1 Loss": "q1_loss",
+ "XXLosses/Q1 LossXX": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
"Policy/Entropy Coeff": "entropy_coef",
"Policy/Learning Rate": "learning_rate",
Mutant 298
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -152,7 +152,7 @@
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Losses/Q1 Loss": "q1_loss",
- "Losses/Q2 Loss": "q2_loss",
+ "XXLosses/Q2 LossXX": "q2_loss",
"Policy/Entropy Coeff": "entropy_coef",
"Policy/Learning Rate": "learning_rate",
}
Mutant 300
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -153,7 +153,7 @@
"Losses/Policy Loss": "policy_loss",
"Losses/Q1 Loss": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
- "Policy/Entropy Coeff": "entropy_coef",
+ "XXPolicy/Entropy CoeffXX": "entropy_coef",
"Policy/Learning Rate": "learning_rate",
}
Mutant 302
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -154,7 +154,7 @@
"Losses/Q1 Loss": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
"Policy/Entropy Coeff": "entropy_coef",
- "Policy/Learning Rate": "learning_rate",
+ "XXPolicy/Learning RateXX": "learning_rate",
}
self.update_dict = {
Mutant 309
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -163,7 +163,7 @@
"q1_loss": self.q1_loss,
"q2_loss": self.q2_loss,
"entropy_coef": self.ent_coef,
- "update_batch": self.update_batch_policy,
+ "XXupdate_batchXX": self.update_batch_policy,
"update_value": self.update_batch_value,
"update_entropy": self.update_batch_entropy,
"learning_rate": self.learning_rate,
Mutant 310
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -164,7 +164,7 @@
"q2_loss": self.q2_loss,
"entropy_coef": self.ent_coef,
"update_batch": self.update_batch_policy,
- "update_value": self.update_batch_value,
+ "XXupdate_valueXX": self.update_batch_value,
"update_entropy": self.update_batch_entropy,
"learning_rate": self.learning_rate,
}
Mutant 311
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -165,7 +165,7 @@
"entropy_coef": self.ent_coef,
"update_batch": self.update_batch_policy,
"update_value": self.update_batch_value,
- "update_entropy": self.update_batch_entropy,
+ "XXupdate_entropyXX": self.update_batch_entropy,
"learning_rate": self.learning_rate,
}
Mutant 313
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -174,7 +174,7 @@
Assign the higher-level SACModel's inputs and outputs to those of its policy or
target network.
"""
- self.vector_in = self.policy.vector_in
+ self.vector_in = None
self.visual_in = self.policy.visual_in
self.next_vector_in = self.target_network.vector_in
self.next_visual_in = self.target_network.visual_in
Mutant 314
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -175,7 +175,7 @@
target network.
"""
self.vector_in = self.policy.vector_in
- self.visual_in = self.policy.visual_in
+ self.visual_in = None
self.next_vector_in = self.target_network.vector_in
self.next_visual_in = self.target_network.visual_in
self.sequence_length_ph = self.policy.sequence_length_ph
Mutant 317
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -178,7 +178,7 @@
self.visual_in = self.policy.visual_in
self.next_vector_in = self.target_network.vector_in
self.next_visual_in = self.target_network.visual_in
- self.sequence_length_ph = self.policy.sequence_length_ph
+ self.sequence_length_ph = None
self.next_sequence_length_ph = self.target_network.sequence_length_ph
if not self.policy.use_continuous_act:
self.action_masks = self.policy_network.action_masks
Mutant 319
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -180,7 +180,7 @@
self.next_visual_in = self.target_network.visual_in
self.sequence_length_ph = self.policy.sequence_length_ph
self.next_sequence_length_ph = self.target_network.sequence_length_ph
- if not self.policy.use_continuous_act:
+ if self.policy.use_continuous_act:
self.action_masks = self.policy_network.action_masks
else:
self.output_pre = self.policy_network.output_pre
Mutant 320
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -181,7 +181,7 @@
self.sequence_length_ph = self.policy.sequence_length_ph
self.next_sequence_length_ph = self.target_network.sequence_length_ph
if not self.policy.use_continuous_act:
- self.action_masks = self.policy_network.action_masks
+ self.action_masks = None
else:
self.output_pre = self.policy_network.output_pre
Mutant 321
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -183,7 +183,7 @@
if not self.policy.use_continuous_act:
self.action_masks = self.policy_network.action_masks
else:
- self.output_pre = self.policy_network.output_pre
+ self.output_pre = None
# Don't use value estimate during inference.
self.value = tf.identity(
Mutant 322
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -187,7 +187,7 @@
# Don't use value estimate during inference.
self.value = tf.identity(
- self.policy_network.value, name="value_estimate_unused"
+ self.policy_network.value, name="XXvalue_estimate_unusedXX"
)
self.value_heads = self.policy_network.value_heads
self.dones_holder = tf.placeholder(
Mutant 324
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -191,7 +191,7 @@
)
self.value_heads = self.policy_network.value_heads
self.dones_holder = tf.placeholder(
- shape=[None], dtype=tf.float32, name="dones_holder"
+ shape=[None], dtype=tf.float32, name="XXdones_holderXX"
)
if self.policy.use_recurrent:
Mutant 327
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -197,7 +197,7 @@
if self.policy.use_recurrent:
self.memory_in = self.policy_network.memory_in
self.memory_out = self.policy_network.memory_out
- if not self.policy.use_continuous_act:
+ if self.policy.use_continuous_act:
self.prev_action = self.policy_network.prev_action
self.next_memory_in = self.target_network.memory_in
Mutant 328
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -198,7 +198,7 @@
self.memory_in = self.policy_network.memory_in
self.memory_out = self.policy_network.memory_out
if not self.policy.use_continuous_act:
- self.prev_action = self.policy_network.prev_action
+ self.prev_action = None
self.next_memory_in = self.target_network.memory_in
def _create_losses(
Mutant 329
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -199,7 +199,7 @@
self.memory_out = self.policy_network.memory_out
if not self.policy.use_continuous_act:
self.prev_action = self.policy_network.prev_action
- self.next_memory_in = self.target_network.memory_in
+ self.next_memory_in = None
def _create_losses(
self,
Mutant 330
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -208,7 +208,7 @@
lr: tf.Tensor,
max_step: int,
stream_names: List[str],
- discrete: bool = False,
+ discrete: bool = True,
) -> None:
"""
Creates training-specific Tensorflow ops for SAC models.
Mutant 331
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -222,7 +222,7 @@
if discrete:
self.target_entropy = [
- self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
+ self.discrete_target_entropy_scale / np.log(i).astype(np.float32)
for i in self.act_size
]
discrete_action_probs = tf.exp(self.policy.all_log_probs)
Mutant 337
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -230,7 +230,7 @@
else:
self.target_entropy = (
-1
- * self.continuous_target_entropy_scale
+ / self.continuous_target_entropy_scale
* np.prod(self.act_size[0]).astype(np.float32)
)
Mutant 338
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -231,7 +231,7 @@
self.target_entropy = (
-1
* self.continuous_target_entropy_scale
- * np.prod(self.act_size[0]).astype(np.float32)
+ / np.prod(self.act_size[0]).astype(np.float32)
)
self.rewards_holders = {}
Mutant 344
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -245,7 +245,7 @@
)
branched_mpq1 = tf.stack(
[
- tf.reduce_sum(_br, axis=1, keep_dims=True)
+ tf.reduce_sum(_br, axis=1, keep_dims=False)
for _br in _branched_mpq1
]
)
Mutant 349
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -257,7 +257,7 @@
)
branched_mpq2 = tf.stack(
[
- tf.reduce_sum(_br, axis=1, keep_dims=True)
+ tf.reduce_sum(_br, axis=1, keep_dims=False)
for _br in _branched_mpq2
]
)
Mutant 356
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -278,7 +278,7 @@
q1_losses = []
q2_losses = []
# Multiple q losses per stream
- expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
+ expanded_dones = tf.expand_dims(self.dones_holder, axis=+1)
for i, name in enumerate(stream_names):
_expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
Mutant 357
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -278,7 +278,7 @@
q1_losses = []
q2_losses = []
# Multiple q losses per stream
- expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
+ expanded_dones = tf.expand_dims(self.dones_holder, axis=-2)
for i, name in enumerate(stream_names):
_expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
Mutant 359
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -280,7 +280,7 @@
# Multiple q losses per stream
expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
for i, name in enumerate(stream_names):
- _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
+ _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=+1)
q_backup = tf.stop_gradient(
_expanded_rewards
Mutant 362
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -284,7 +284,7 @@
q_backup = tf.stop_gradient(
_expanded_rewards
- + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
+ - (1.0 - self.use_dones_in_backup[name] * expanded_dones)
* self.gammas[i]
* self.target_network.value_heads[name]
)
Mutant 373
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -304,7 +304,7 @@
for _branch in branched_q1_stream
]
branched_q2_stream = [
- tf.reduce_sum(_branch, axis=1, keep_dims=True)
+ tf.reduce_sum(_branch, axis=1, keep_dims=False)
for _branch in branched_q2_stream
]
Mutant 380
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -315,7 +315,7 @@
q1_stream = q1_streams[name]
q2_stream = q2_streams[name]
- _q1_loss = 0.5 * tf.reduce_mean(
+ _q1_loss = 1.5 * tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(q_backup, q1_stream)
)
Mutant 388
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -335,7 +335,7 @@
if discrete:
# Create a log_ent_coef for each branch
self.log_ent_coef = tf.get_variable(
- "log_ent_coef",
+ "XXlog_ent_coefXX",
dtype=tf.float32,
initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
np.float32
Mutant 390
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -340,7 +340,7 @@
initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
np.float32
),
- trainable=True,
+ trainable=False,
)
else:
self.log_ent_coef = tf.get_variable(
Mutant 391
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -344,7 +344,7 @@
)
else:
self.log_ent_coef = tf.get_variable(
- "log_ent_coef",
+ "XXlog_ent_coefXX",
dtype=tf.float32,
initializer=np.log(self.init_entcoef).astype(np.float32),
trainable=True,
Mutant 392
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -347,7 +347,7 @@
"log_ent_coef",
dtype=tf.float32,
initializer=np.log(self.init_entcoef).astype(np.float32),
- trainable=True,
+ trainable=False,
)
self.ent_coef = tf.exp(self.log_ent_coef)
Mutant 396
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -358,7 +358,7 @@
)
branched_ent_sums = tf.stack(
[
- tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
+ tf.reduce_sum(_lp, axis=1, keep_dims=True) - _te
for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
],
axis=1,
Mutant 399
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -365,7 +365,7 @@
)
self.entropy_loss = -tf.reduce_mean(
tf.to_float(self.policy.mask)
- * tf.reduce_mean(
+ / tf.reduce_mean(
self.log_ent_coef
* tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
axis=1,
Mutant 400
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -367,7 +367,7 @@
tf.to_float(self.policy.mask)
* tf.reduce_mean(
self.log_ent_coef
- * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
+ / tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
axis=1,
)
)
Mutant 407
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -381,7 +381,7 @@
branched_policy_loss = tf.stack(
[
- tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
+ tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=False)
for i, (_lp, _qt) in enumerate(
zip(branched_per_action_ent, branched_q_term)
)
Mutant 411
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -394,7 +394,7 @@
# Do vbackup entropy bonus per branch as well.
branched_ent_bonus = tf.stack(
[
- tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
+ tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=False)
for i, _lp in enumerate(branched_per_action_ent)
]
)
Mutant 420
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -418,7 +418,7 @@
self.entropy_loss = -tf.reduce_mean(
self.log_ent_coef
* tf.to_float(self.policy.mask)
- * tf.stop_gradient(
+ / tf.stop_gradient(
tf.reduce_sum(
self.policy.all_log_probs + self.target_entropy,
axis=1,
Mutant 423
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -422,7 +422,7 @@
tf.reduce_sum(
self.policy.all_log_probs + self.target_entropy,
axis=1,
- keep_dims=True,
+ keep_dims=False,
)
)
)
Mutant 429
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -438,7 +438,7 @@
for name in stream_names:
v_backup = tf.stop_gradient(
self.min_policy_qs[name]
- - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+ + tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
)
value_losses.append(
0.5
Mutant 430
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -438,7 +438,7 @@
for name in stream_names:
v_backup = tf.stop_gradient(
self.min_policy_qs[name]
- - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+ - tf.reduce_sum(self.ent_coef / self.policy.all_log_probs, axis=1)
)
value_losses.append(
0.5
Mutant 432
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -441,7 +441,7 @@
- tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
)
value_losses.append(
- 0.5
+ 1.5
* tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(
Mutant 433
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -442,7 +442,7 @@
)
value_losses.append(
0.5
- * tf.reduce_mean(
+ / tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(
self.policy_network.value_heads[name], v_backup
Mutant 439
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -453,7 +453,7 @@
self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
- self.entropy = self.policy_network.entropy
+ self.entropy = None
def _create_sac_optimizer_ops(self) -> None:
"""
Mutant 440
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -461,7 +461,7 @@
the policy, value, and entropy updates, as well as the target network update.
"""
policy_optimizer = self.create_optimizer_op(
- learning_rate=self.learning_rate, name="sac_policy_opt"
+ learning_rate=self.learning_rate, name="XXsac_policy_optXX"
)
entropy_optimizer = self.create_optimizer_op(
learning_rate=self.learning_rate, name="sac_entropy_opt"
Mutant 441
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -464,7 +464,7 @@
learning_rate=self.learning_rate, name="sac_policy_opt"
)
entropy_optimizer = self.create_optimizer_op(
- learning_rate=self.learning_rate, name="sac_entropy_opt"
+ learning_rate=self.learning_rate, name="XXsac_entropy_optXX"
)
value_optimizer = self.create_optimizer_op(
learning_rate=self.learning_rate, name="sac_value_opt"
Mutant 442
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -467,7 +467,7 @@
learning_rate=self.learning_rate, name="sac_entropy_opt"
)
value_optimizer = self.create_optimizer_op(
- learning_rate=self.learning_rate, name="sac_value_opt"
+ learning_rate=self.learning_rate, name="XXsac_value_optXX"
)
self.target_update_op = [
Mutant 446
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -471,7 +471,7 @@
)
self.target_update_op = [
- tf.assign(target, (1 - self.tau) * target + self.tau * source)
+ tf.assign(target, (1 - self.tau) * target - self.tau * source)
for target, source in zip(
self.target_network.value_vars, self.policy_network.value_vars
)
Mutant 448
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -476,7 +476,7 @@
self.target_network.value_vars, self.policy_network.value_vars
)
]
- logger.debug("value_vars")
+ logger.debug("XXvalue_varsXX")
self.print_all_vars(self.policy_network.value_vars)
logger.debug("targvalue_vars")
self.print_all_vars(self.target_network.value_vars)
Mutant 449
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -478,7 +478,7 @@
]
logger.debug("value_vars")
self.print_all_vars(self.policy_network.value_vars)
- logger.debug("targvalue_vars")
+ logger.debug("XXtargvalue_varsXX")
self.print_all_vars(self.target_network.value_vars)
logger.debug("critic_vars")
self.print_all_vars(self.policy_network.critic_vars)
Mutant 450
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -480,7 +480,7 @@
self.print_all_vars(self.policy_network.value_vars)
logger.debug("targvalue_vars")
self.print_all_vars(self.target_network.value_vars)
- logger.debug("critic_vars")
+ logger.debug("XXcritic_varsXX")
self.print_all_vars(self.policy_network.critic_vars)
logger.debug("q_vars")
self.print_all_vars(self.policy_network.q_vars)
Mutant 451
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -482,7 +482,7 @@
self.print_all_vars(self.target_network.value_vars)
logger.debug("critic_vars")
self.print_all_vars(self.policy_network.critic_vars)
- logger.debug("q_vars")
+ logger.debug("XXq_varsXX")
self.print_all_vars(self.policy_network.q_vars)
logger.debug("policy_vars")
policy_vars = self.policy.get_trainable_variables()
Mutant 452
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -484,7 +484,7 @@
self.print_all_vars(self.policy_network.critic_vars)
logger.debug("q_vars")
self.print_all_vars(self.policy_network.q_vars)
- logger.debug("policy_vars")
+ logger.debug("XXpolicy_varsXX")
policy_vars = self.policy.get_trainable_variables()
self.print_all_vars(policy_vars)
Mutant 454
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -514,7 +514,6 @@
for _var in variables:
logger.debug(_var)
- @timed
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"""
Updates model using buffer.
Mutant 466
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -596,7 +596,7 @@
:param num_sequences: Number of LSTM sequences in batch.
"""
# Do an optional burn-in for memories
- num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
+ num_burn_in = int(self.burn_in_ratio / self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
burn_in_mask[range(0, num_burn_in)] = 0
burn_in_mask = np.tile(burn_in_mask, num_sequences)
Mutant 469
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -598,7 +598,7 @@
# Do an optional burn-in for memories
num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
- burn_in_mask[range(0, num_burn_in)] = 0
+ burn_in_mask[range(1, num_burn_in)] = 0
burn_in_mask = np.tile(burn_in_mask, num_sequences)
feed_dict = {
policy.batch_size_ph: num_sequences,
Mutant 470
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -598,7 +598,7 @@
# Do an optional burn-in for memories
num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
- burn_in_mask[range(0, num_burn_in)] = 0
+ burn_in_mask[range(0, num_burn_in)] = 1
burn_in_mask = np.tile(burn_in_mask, num_sequences)
feed_dict = {
policy.batch_size_ph: num_sequences,
Mutant 471
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -598,7 +598,7 @@
# Do an optional burn-in for memories
num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
- burn_in_mask[range(0, num_burn_in)] = 0
+ burn_in_mask[range(0, num_burn_in)] = None
burn_in_mask = np.tile(burn_in_mask, num_sequences)
feed_dict = {
policy.batch_size_ph: num_sequences,
Mutant 474
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -604,7 +604,7 @@
policy.batch_size_ph: num_sequences,
policy.sequence_length_ph: self.policy.sequence_length,
self.next_sequence_length_ph: self.policy.sequence_length,
- self.policy.mask_input: batch["masks"] * burn_in_mask,
+ self.policy.mask_input: batch["masks"] / burn_in_mask,
}
for name in self.reward_signals:
feed_dict[self.rewards_holders[name]] = batch[f"{name}_rewards"]
Mutant 488
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -619,7 +619,7 @@
if self.policy.use_vec_obs:
feed_dict[policy.vector_in] = batch["vector_obs"]
feed_dict[self.next_vector_in] = batch["next_vector_in"]
- if self.policy.vis_obs_size > 0:
+ if self.policy.vis_obs_size >= 0:
for i, _ in enumerate(policy.visual_in):
_obs = batch["visual_obs%d" % i]
feed_dict[policy.visual_in[i]] = _obs
Mutant 499
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -629,7 +629,7 @@
if self.policy.use_recurrent:
feed_dict[policy.memory_in] = [
batch["memory"][i]
- for i in range(0, len(batch["memory"]), self.policy.sequence_length)
+ for i in range(1, len(batch["memory"]), self.policy.sequence_length)
]
feed_dict[self.policy_network.memory_in] = self._make_zero_mem(
self.m_size, batch.num_experiences