ml-agents/mlagents/trainers/sac/optimizer.py
Killed 171 out of 253 mutants
Survived
Survived mutation testing. These mutants show holes in your test suite.
Mutant 80
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -12,7 +12,7 @@
from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings, SACSettings
-EPSILON = 1e-6 # Small value to avoid divide by zero
+EPSILON = 1.000001 # Small value to avoid divide by zero
logger = get_logger(__name__)
Mutant 81
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -12,7 +12,7 @@
from mlagents_envs.timers import timed
from mlagents.trainers.settings import TrainerSettings, SACSettings
-EPSILON = 1e-6 # Small value to avoid divide by zero
+EPSILON = None # Small value to avoid divide by zero
logger = get_logger(__name__)
Mutant 83
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -16,7 +16,7 @@
logger = get_logger(__name__)
-POLICY_SCOPE = ""
+POLICY_SCOPE = "XXXX"
TARGET_SCOPE = "target_network"
Mutant 84
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -16,7 +16,7 @@
logger = get_logger(__name__)
-POLICY_SCOPE = ""
+POLICY_SCOPE = None
TARGET_SCOPE = "target_network"
Mutant 85
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -17,7 +17,7 @@
logger = get_logger(__name__)
POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
+TARGET_SCOPE = "XXtarget_networkXX"
class SACOptimizer(TFOptimizer):
Mutant 86
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -17,7 +17,7 @@
logger = get_logger(__name__)
POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
+TARGET_SCOPE = None
class SACOptimizer(TFOptimizer):
Mutant 91
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -51,7 +51,7 @@
lr = hyperparameters.learning_rate
lr_schedule = hyperparameters.learning_rate_schedule
max_step = trainer_params.max_steps
- self.tau = hyperparameters.tau
+ self.tau = None
self.init_entcoef = hyperparameters.init_entcoef
self.policy = policy
Mutant 103
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -68,7 +68,7 @@
self.discrete_target_entropy_scale = (
0.2 # Roughly equal to e-greedy 0.05
)
- self.continuous_target_entropy_scale = 1.0
+ self.continuous_target_entropy_scale = 2.0
stream_names = list(self.reward_signals.keys())
# Use to reduce "survivor bonus" when using Curiosity or GAIL.
Mutant 106
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -76,7 +76,7 @@
_val.gamma for _val in trainer_params.reward_signals.values()
]
self.use_dones_in_backup = {
- name: tf.Variable(1.0) for name in stream_names
+ name: tf.Variable(2.0) for name in stream_names
}
self.disable_use_dones = {
name: self.use_dones_in_backup[name].assign(0.0)
Mutant 107
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -79,7 +79,7 @@
name: tf.Variable(1.0) for name in stream_names
}
self.disable_use_dones = {
- name: self.use_dones_in_backup[name].assign(0.0)
+ name: self.use_dones_in_backup[name].assign(1.0)
for name in stream_names
}
Mutant 108
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -83,7 +83,7 @@
for name in stream_names
}
- if num_layers < 1:
+ if num_layers <= 1:
num_layers = 1
self.target_init_op: List[tf.Tensor] = []
Mutant 109
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -83,7 +83,7 @@
for name in stream_names
}
- if num_layers < 1:
+ if num_layers < 2:
num_layers = 1
self.target_init_op: List[tf.Tensor] = []
Mutant 110
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -86,7 +86,7 @@
if num_layers < 1:
num_layers = 1
- self.target_init_op: List[tf.Tensor] = []
+ self.target_init_op: List[tf.Tensor] = None
self.target_update_op: List[tf.Tensor] = []
self.update_batch_policy: Optional[tf.Operation] = None
self.update_batch_value: Optional[tf.Operation] = None
Mutant 111
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -87,7 +87,7 @@
num_layers = 1
self.target_init_op: List[tf.Tensor] = []
- self.target_update_op: List[tf.Tensor] = []
+ self.target_update_op: List[tf.Tensor] = None
self.update_batch_policy: Optional[tf.Operation] = None
self.update_batch_value: Optional[tf.Operation] = None
self.update_batch_entropy: Optional[tf.Operation] = None
Mutant 112
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -88,7 +88,7 @@
self.target_init_op: List[tf.Tensor] = []
self.target_update_op: List[tf.Tensor] = []
- self.update_batch_policy: Optional[tf.Operation] = None
+ self.update_batch_policy: Optional[tf.Operation] = ""
self.update_batch_value: Optional[tf.Operation] = None
self.update_batch_entropy: Optional[tf.Operation] = None
Mutant 113
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -89,7 +89,7 @@
self.target_init_op: List[tf.Tensor] = []
self.target_update_op: List[tf.Tensor] = []
self.update_batch_policy: Optional[tf.Operation] = None
- self.update_batch_value: Optional[tf.Operation] = None
+ self.update_batch_value: Optional[tf.Operation] = ""
self.update_batch_entropy: Optional[tf.Operation] = None
self.policy_network = SACPolicyNetwork(
Mutant 114
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -90,7 +90,7 @@
self.target_update_op: List[tf.Tensor] = []
self.update_batch_policy: Optional[tf.Operation] = None
self.update_batch_value: Optional[tf.Operation] = None
- self.update_batch_entropy: Optional[tf.Operation] = None
+ self.update_batch_entropy: Optional[tf.Operation] = ""
self.policy_network = SACPolicyNetwork(
policy=self.policy,
Mutant 118
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -120,7 +120,7 @@
lr,
self.policy.global_step,
int(max_step),
- min_value=1e-10,
+ min_value=1.0000000001,
)
self._create_losses(
self.policy_network.q1_heads,
Mutant 120
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -149,7 +149,7 @@
self.policy.initialize_or_load()
self.stats_name_to_update_name = {
- "Losses/Value Loss": "value_loss",
+ "XXLosses/Value LossXX": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Losses/Q1 Loss": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
Mutant 122
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -150,7 +150,7 @@
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
- "Losses/Policy Loss": "policy_loss",
+ "XXLosses/Policy LossXX": "policy_loss",
"Losses/Q1 Loss": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
"Policy/Entropy Coeff": "entropy_coef",
Mutant 124
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -151,7 +151,7 @@
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
- "Losses/Q1 Loss": "q1_loss",
+ "XXLosses/Q1 LossXX": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
"Policy/Entropy Coeff": "entropy_coef",
"Policy/Learning Rate": "learning_rate",
Mutant 126
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -152,7 +152,7 @@
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
"Losses/Q1 Loss": "q1_loss",
- "Losses/Q2 Loss": "q2_loss",
+ "XXLosses/Q2 LossXX": "q2_loss",
"Policy/Entropy Coeff": "entropy_coef",
"Policy/Learning Rate": "learning_rate",
}
Mutant 128
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -153,7 +153,7 @@
"Losses/Policy Loss": "policy_loss",
"Losses/Q1 Loss": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
- "Policy/Entropy Coeff": "entropy_coef",
+ "XXPolicy/Entropy CoeffXX": "entropy_coef",
"Policy/Learning Rate": "learning_rate",
}
Mutant 130
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -154,7 +154,7 @@
"Losses/Q1 Loss": "q1_loss",
"Losses/Q2 Loss": "q2_loss",
"Policy/Entropy Coeff": "entropy_coef",
- "Policy/Learning Rate": "learning_rate",
+ "XXPolicy/Learning RateXX": "learning_rate",
}
self.update_dict = {
Mutant 137
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -163,7 +163,7 @@
"q1_loss": self.q1_loss,
"q2_loss": self.q2_loss,
"entropy_coef": self.ent_coef,
- "update_batch": self.update_batch_policy,
+ "XXupdate_batchXX": self.update_batch_policy,
"update_value": self.update_batch_value,
"update_entropy": self.update_batch_entropy,
"learning_rate": self.learning_rate,
Mutant 138
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -164,7 +164,7 @@
"q2_loss": self.q2_loss,
"entropy_coef": self.ent_coef,
"update_batch": self.update_batch_policy,
- "update_value": self.update_batch_value,
+ "XXupdate_valueXX": self.update_batch_value,
"update_entropy": self.update_batch_entropy,
"learning_rate": self.learning_rate,
}
Mutant 139
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -165,7 +165,7 @@
"entropy_coef": self.ent_coef,
"update_batch": self.update_batch_policy,
"update_value": self.update_batch_value,
- "update_entropy": self.update_batch_entropy,
+ "XXupdate_entropyXX": self.update_batch_entropy,
"learning_rate": self.learning_rate,
}
Mutant 141
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -174,7 +174,7 @@
Assign the higher-level SACModel's inputs and outputs to those of its policy or
target network.
"""
- self.vector_in = self.policy.vector_in
+ self.vector_in = None
self.visual_in = self.policy.visual_in
self.next_vector_in = self.target_network.vector_in
self.next_visual_in = self.target_network.visual_in
Mutant 142
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -175,7 +175,7 @@
target network.
"""
self.vector_in = self.policy.vector_in
- self.visual_in = self.policy.visual_in
+ self.visual_in = None
self.next_vector_in = self.target_network.vector_in
self.next_visual_in = self.target_network.visual_in
self.sequence_length_ph = self.policy.sequence_length_ph
Mutant 145
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -178,7 +178,7 @@
self.visual_in = self.policy.visual_in
self.next_vector_in = self.target_network.vector_in
self.next_visual_in = self.target_network.visual_in
- self.sequence_length_ph = self.policy.sequence_length_ph
+ self.sequence_length_ph = None
self.next_sequence_length_ph = self.target_network.sequence_length_ph
if not self.policy.use_continuous_act:
self.action_masks = self.policy_network.action_masks
Mutant 147
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -180,7 +180,7 @@
self.next_visual_in = self.target_network.visual_in
self.sequence_length_ph = self.policy.sequence_length_ph
self.next_sequence_length_ph = self.target_network.sequence_length_ph
- if not self.policy.use_continuous_act:
+ if self.policy.use_continuous_act:
self.action_masks = self.policy_network.action_masks
else:
self.output_pre = self.policy_network.output_pre
Mutant 148
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -181,7 +181,7 @@
self.sequence_length_ph = self.policy.sequence_length_ph
self.next_sequence_length_ph = self.target_network.sequence_length_ph
if not self.policy.use_continuous_act:
- self.action_masks = self.policy_network.action_masks
+ self.action_masks = None
else:
self.output_pre = self.policy_network.output_pre
Mutant 149
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -183,7 +183,7 @@
if not self.policy.use_continuous_act:
self.action_masks = self.policy_network.action_masks
else:
- self.output_pre = self.policy_network.output_pre
+ self.output_pre = None
# Don't use value estimate during inference.
self.value = tf.identity(
Mutant 150
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -187,7 +187,7 @@
# Don't use value estimate during inference.
self.value = tf.identity(
- self.policy_network.value, name="value_estimate_unused"
+ self.policy_network.value, name="XXvalue_estimate_unusedXX"
)
self.value_heads = self.policy_network.value_heads
self.dones_holder = tf.placeholder(
Mutant 152
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -191,7 +191,7 @@
)
self.value_heads = self.policy_network.value_heads
self.dones_holder = tf.placeholder(
- shape=[None], dtype=tf.float32, name="dones_holder"
+ shape=[None], dtype=tf.float32, name="XXdones_holderXX"
)
if self.policy.use_recurrent:
Mutant 155
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -197,7 +197,7 @@
if self.policy.use_recurrent:
self.memory_in = self.policy_network.memory_in
self.memory_out = self.policy_network.memory_out
- if not self.policy.use_continuous_act:
+ if self.policy.use_continuous_act:
self.prev_action = self.policy_network.prev_action
self.next_memory_in = self.target_network.memory_in
Mutant 156
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -198,7 +198,7 @@
self.memory_in = self.policy_network.memory_in
self.memory_out = self.policy_network.memory_out
if not self.policy.use_continuous_act:
- self.prev_action = self.policy_network.prev_action
+ self.prev_action = None
self.next_memory_in = self.target_network.memory_in
def _create_losses(
Mutant 157
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -199,7 +199,7 @@
self.memory_out = self.policy_network.memory_out
if not self.policy.use_continuous_act:
self.prev_action = self.policy_network.prev_action
- self.next_memory_in = self.target_network.memory_in
+ self.next_memory_in = None
def _create_losses(
self,
Mutant 158
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -208,7 +208,7 @@
lr: tf.Tensor,
max_step: int,
stream_names: List[str],
- discrete: bool = False,
+ discrete: bool = True,
) -> None:
"""
Creates training-specific Tensorflow ops for SAC models.
Mutant 159
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -222,7 +222,7 @@
if discrete:
self.target_entropy = [
- self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
+ self.discrete_target_entropy_scale / np.log(i).astype(np.float32)
for i in self.act_size
]
discrete_action_probs = tf.exp(self.policy.all_log_probs)
Mutant 164
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -229,7 +229,7 @@
per_action_entropy = discrete_action_probs * self.policy.all_log_probs
else:
self.target_entropy = (
- -1
+ -2
* self.continuous_target_entropy_scale
* np.prod(self.act_size[0]).astype(np.float32)
)
Mutant 165
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -230,7 +230,7 @@
else:
self.target_entropy = (
-1
- * self.continuous_target_entropy_scale
+ / self.continuous_target_entropy_scale
* np.prod(self.act_size[0]).astype(np.float32)
)
Mutant 166
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -231,7 +231,7 @@
self.target_entropy = (
-1
* self.continuous_target_entropy_scale
- * np.prod(self.act_size[0]).astype(np.float32)
+ / np.prod(self.act_size[0]).astype(np.float32)
)
self.rewards_holders = {}
Mutant 184
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -278,7 +278,7 @@
q1_losses = []
q2_losses = []
# Multiple q losses per stream
- expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
+ expanded_dones = tf.expand_dims(self.dones_holder, axis=+1)
for i, name in enumerate(stream_names):
_expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
Mutant 185
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -278,7 +278,7 @@
q1_losses = []
q2_losses = []
# Multiple q losses per stream
- expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
+ expanded_dones = tf.expand_dims(self.dones_holder, axis=-2)
for i, name in enumerate(stream_names):
_expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
Mutant 187
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -280,7 +280,7 @@
# Multiple q losses per stream
expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
for i, name in enumerate(stream_names):
- _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
+ _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=+1)
q_backup = tf.stop_gradient(
_expanded_rewards
Mutant 190
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -284,7 +284,7 @@
q_backup = tf.stop_gradient(
_expanded_rewards
- + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
+ - (1.0 - self.use_dones_in_backup[name] * expanded_dones)
* self.gammas[i]
* self.target_network.value_heads[name]
)
Mutant 194
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -285,7 +285,7 @@
q_backup = tf.stop_gradient(
_expanded_rewards
+ (1.0 - self.use_dones_in_backup[name] * expanded_dones)
- * self.gammas[i]
+ / self.gammas[i]
* self.target_network.value_heads[name]
)
Mutant 199
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -300,7 +300,7 @@
# Reduce each branch into scalar
branched_q1_stream = [
- tf.reduce_sum(_branch, axis=1, keep_dims=True)
+ tf.reduce_sum(_branch, axis=1, keep_dims=False)
for _branch in branched_q1_stream
]
branched_q2_stream = [
Mutant 208
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -315,7 +315,7 @@
q1_stream = q1_streams[name]
q2_stream = q2_streams[name]
- _q1_loss = 0.5 * tf.reduce_mean(
+ _q1_loss = 1.5 * tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(q_backup, q1_stream)
)
Mutant 211
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -320,7 +320,7 @@
* tf.squared_difference(q_backup, q1_stream)
)
- _q2_loss = 0.5 * tf.reduce_mean(
+ _q2_loss = 1.5 * tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(q_backup, q2_stream)
)
Mutant 216
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -335,7 +335,7 @@
if discrete:
# Create a log_ent_coef for each branch
self.log_ent_coef = tf.get_variable(
- "log_ent_coef",
+ "XXlog_ent_coefXX",
dtype=tf.float32,
initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
np.float32
Mutant 218
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -340,7 +340,7 @@
initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
np.float32
),
- trainable=True,
+ trainable=False,
)
else:
self.log_ent_coef = tf.get_variable(
Mutant 219
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -344,7 +344,7 @@
)
else:
self.log_ent_coef = tf.get_variable(
- "log_ent_coef",
+ "XXlog_ent_coefXX",
dtype=tf.float32,
initializer=np.log(self.init_entcoef).astype(np.float32),
trainable=True,
Mutant 220
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -347,7 +347,7 @@
"log_ent_coef",
dtype=tf.float32,
initializer=np.log(self.init_entcoef).astype(np.float32),
- trainable=True,
+ trainable=False,
)
self.ent_coef = tf.exp(self.log_ent_coef)
Mutant 224
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -358,7 +358,7 @@
)
branched_ent_sums = tf.stack(
[
- tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
+ tf.reduce_sum(_lp, axis=1, keep_dims=True) - _te
for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
],
axis=1,
Mutant 228
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -367,7 +367,7 @@
tf.to_float(self.policy.mask)
* tf.reduce_mean(
self.log_ent_coef
- * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
+ / tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
axis=1,
)
)
Mutant 235
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -381,7 +381,7 @@
branched_policy_loss = tf.stack(
[
- tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
+ tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=False)
for i, (_lp, _qt) in enumerate(
zip(branched_per_action_ent, branched_q_term)
)
Mutant 239
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -394,7 +394,7 @@
# Do vbackup entropy bonus per branch as well.
branched_ent_bonus = tf.stack(
[
- tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
+ tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=False)
for i, _lp in enumerate(branched_per_action_ent)
]
)
Mutant 241
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -402,7 +402,7 @@
for name in stream_names:
v_backup = tf.stop_gradient(
self.min_policy_qs[name]
- - tf.reduce_mean(branched_ent_bonus, axis=0)
+ + tf.reduce_mean(branched_ent_bonus, axis=0)
)
value_losses.append(
0.5
Mutant 243
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -405,7 +405,7 @@
- tf.reduce_mean(branched_ent_bonus, axis=0)
)
value_losses.append(
- 0.5
+ 1.5
* tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(
Mutant 248
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -418,7 +418,7 @@
self.entropy_loss = -tf.reduce_mean(
self.log_ent_coef
* tf.to_float(self.policy.mask)
- * tf.stop_gradient(
+ / tf.stop_gradient(
tf.reduce_sum(
self.policy.all_log_probs + self.target_entropy,
axis=1,
Mutant 251
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -422,7 +422,7 @@
tf.reduce_sum(
self.policy.all_log_probs + self.target_entropy,
axis=1,
- keep_dims=True,
+ keep_dims=False,
)
)
)
Mutant 257
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -438,7 +438,7 @@
for name in stream_names:
v_backup = tf.stop_gradient(
self.min_policy_qs[name]
- - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+ + tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
)
value_losses.append(
0.5
Mutant 258
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -438,7 +438,7 @@
for name in stream_names:
v_backup = tf.stop_gradient(
self.min_policy_qs[name]
- - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
+ - tf.reduce_sum(self.ent_coef / self.policy.all_log_probs, axis=1)
)
value_losses.append(
0.5
Mutant 260
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -441,7 +441,7 @@
- tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
)
value_losses.append(
- 0.5
+ 1.5
* tf.reduce_mean(
tf.to_float(self.policy.mask)
* tf.squared_difference(
Mutant 267
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -453,7 +453,7 @@
self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
- self.entropy = self.policy_network.entropy
+ self.entropy = None
def _create_sac_optimizer_ops(self) -> None:
"""
Mutant 268
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -461,7 +461,7 @@
the policy, value, and entropy updates, as well as the target network update.
"""
policy_optimizer = self.create_optimizer_op(
- learning_rate=self.learning_rate, name="sac_policy_opt"
+ learning_rate=self.learning_rate, name="XXsac_policy_optXX"
)
entropy_optimizer = self.create_optimizer_op(
learning_rate=self.learning_rate, name="sac_entropy_opt"
Mutant 269
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -464,7 +464,7 @@
learning_rate=self.learning_rate, name="sac_policy_opt"
)
entropy_optimizer = self.create_optimizer_op(
- learning_rate=self.learning_rate, name="sac_entropy_opt"
+ learning_rate=self.learning_rate, name="XXsac_entropy_optXX"
)
value_optimizer = self.create_optimizer_op(
learning_rate=self.learning_rate, name="sac_value_opt"
Mutant 270
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -467,7 +467,7 @@
learning_rate=self.learning_rate, name="sac_entropy_opt"
)
value_optimizer = self.create_optimizer_op(
- learning_rate=self.learning_rate, name="sac_value_opt"
+ learning_rate=self.learning_rate, name="XXsac_value_optXX"
)
self.target_update_op = [
Mutant 276
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -476,7 +476,7 @@
self.target_network.value_vars, self.policy_network.value_vars
)
]
- logger.debug("value_vars")
+ logger.debug("XXvalue_varsXX")
self.print_all_vars(self.policy_network.value_vars)
logger.debug("targvalue_vars")
self.print_all_vars(self.target_network.value_vars)
Mutant 277
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -478,7 +478,7 @@
]
logger.debug("value_vars")
self.print_all_vars(self.policy_network.value_vars)
- logger.debug("targvalue_vars")
+ logger.debug("XXtargvalue_varsXX")
self.print_all_vars(self.target_network.value_vars)
logger.debug("critic_vars")
self.print_all_vars(self.policy_network.critic_vars)
Mutant 278
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -480,7 +480,7 @@
self.print_all_vars(self.policy_network.value_vars)
logger.debug("targvalue_vars")
self.print_all_vars(self.target_network.value_vars)
- logger.debug("critic_vars")
+ logger.debug("XXcritic_varsXX")
self.print_all_vars(self.policy_network.critic_vars)
logger.debug("q_vars")
self.print_all_vars(self.policy_network.q_vars)
Mutant 279
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -482,7 +482,7 @@
self.print_all_vars(self.target_network.value_vars)
logger.debug("critic_vars")
self.print_all_vars(self.policy_network.critic_vars)
- logger.debug("q_vars")
+ logger.debug("XXq_varsXX")
self.print_all_vars(self.policy_network.q_vars)
logger.debug("policy_vars")
policy_vars = self.policy.get_trainable_variables()
Mutant 280
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -484,7 +484,7 @@
self.print_all_vars(self.policy_network.critic_vars)
logger.debug("q_vars")
self.print_all_vars(self.policy_network.q_vars)
- logger.debug("policy_vars")
+ logger.debug("XXpolicy_varsXX")
policy_vars = self.policy.get_trainable_variables()
self.print_all_vars(policy_vars)
Mutant 294
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -596,7 +596,7 @@
:param num_sequences: Number of LSTM sequences in batch.
"""
# Do an optional burn-in for memories
- num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
+ num_burn_in = int(self.burn_in_ratio / self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
burn_in_mask[range(0, num_burn_in)] = 0
burn_in_mask = np.tile(burn_in_mask, num_sequences)
Mutant 297
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -598,7 +598,7 @@
# Do an optional burn-in for memories
num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
- burn_in_mask[range(0, num_burn_in)] = 0
+ burn_in_mask[range(1, num_burn_in)] = 0
burn_in_mask = np.tile(burn_in_mask, num_sequences)
feed_dict = {
policy.batch_size_ph: num_sequences,
Mutant 298
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -598,7 +598,7 @@
# Do an optional burn-in for memories
num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
- burn_in_mask[range(0, num_burn_in)] = 0
+ burn_in_mask[range(0, num_burn_in)] = 1
burn_in_mask = np.tile(burn_in_mask, num_sequences)
feed_dict = {
policy.batch_size_ph: num_sequences,
Mutant 299
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -598,7 +598,7 @@
# Do an optional burn-in for memories
num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
- burn_in_mask[range(0, num_burn_in)] = 0
+ burn_in_mask[range(0, num_burn_in)] = None
burn_in_mask = np.tile(burn_in_mask, num_sequences)
feed_dict = {
policy.batch_size_ph: num_sequences,
Mutant 302
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -604,7 +604,7 @@
policy.batch_size_ph: num_sequences,
policy.sequence_length_ph: self.policy.sequence_length,
self.next_sequence_length_ph: self.policy.sequence_length,
- self.policy.mask_input: batch["masks"] * burn_in_mask,
+ self.policy.mask_input: batch["masks"] / burn_in_mask,
}
for name in self.reward_signals:
feed_dict[self.rewards_holders[name]] = batch[f"{name}_rewards"]
Mutant 316
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -619,7 +619,7 @@
if self.policy.use_vec_obs:
feed_dict[policy.vector_in] = batch["vector_obs"]
feed_dict[self.next_vector_in] = batch["next_vector_in"]
- if self.policy.vis_obs_size > 0:
+ if self.policy.vis_obs_size >= 0:
for i, _ in enumerate(policy.visual_in):
_obs = batch["visual_obs%d" % i]
feed_dict[policy.visual_in[i]] = _obs
Mutant 327
--- ml-agents/mlagents/trainers/sac/optimizer.py
+++ ml-agents/mlagents/trainers/sac/optimizer.py
@@ -629,7 +629,7 @@
if self.policy.use_recurrent:
feed_dict[policy.memory_in] = [
batch["memory"][i]
- for i in range(0, len(batch["memory"]), self.policy.sequence_length)
+ for i in range(1, len(batch["memory"]), self.policy.sequence_length)
]
feed_dict[self.policy_network.memory_in] = self._make_zero_mem(
self.m_size, batch.num_experiences