ml-agents/mlagents/trainers/sac/trainer.py
Killed 34 out of 79 mutants
Timeouts
Mutants that made the test suite take a lot longer, so the tests were killed.
Mutant 297
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -237,7 +237,7 @@
batch_update_stats: Dict[str, list] = defaultdict(list)
while (
self.step - self.hyperparameters.buffer_init_steps
- ) / self.update_steps > self.steps_per_update:
+ ) * self.update_steps > self.steps_per_update:
logger.debug(f"Updating SAC policy at step {self.step}")
buffer = self.update_buffer
if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:
Mutant 300
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -240,7 +240,7 @@
) / self.update_steps > self.steps_per_update:
logger.debug(f"Updating SAC policy at step {self.step}")
buffer = self.update_buffer
- if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:
+ if self.update_buffer.num_experiences > self.hyperparameters.batch_size:
sampled_minibatch = buffer.sample_mini_batch(
self.hyperparameters.batch_size,
sequence_length=self.policy.sequence_length,
Mutant 302
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -255,7 +255,7 @@
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
- self.update_steps += 1
+ self.update_steps = 1
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
Mutant 315
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -291,7 +291,7 @@
batch_update_stats: Dict[str, list] = defaultdict(list)
while (
self.step - self.hyperparameters.buffer_init_steps
- ) / self.reward_signal_update_steps > self.reward_signal_steps_per_update:
+ ) * self.reward_signal_update_steps > self.reward_signal_steps_per_update:
# Get minibatches for reward signal update if needed
reward_signal_minibatches = {}
for name, signal in self.optimizer.reward_signals.items():
Mutant 318
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -307,7 +307,7 @@
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
- self.reward_signal_update_steps += 1
+ self.reward_signal_update_steps = 1
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
Survived
Survived mutation testing. These mutants show holes in your test suite.
Mutant 255
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -23,7 +23,7 @@
logger = get_logger(__name__)
-BUFFER_TRUNCATE_PERCENT = 0.8
+BUFFER_TRUNCATE_PERCENT = 1.8
class SACTrainer(RLTrainer):
Mutant 257
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -56,7 +56,7 @@
brain_name, trainer_settings, training, artifact_path, reward_buff_cap
)
- self.load = load
+ self.load = None
self.seed = seed
self.policy: Policy = None # type: ignore
self.optimizer: SACOptimizer = None # type: ignore
Mutant 259
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -58,7 +58,7 @@
self.load = load
self.seed = seed
- self.policy: Policy = None # type: ignore
+ self.policy: Policy = "" # type: ignore
self.optimizer: SACOptimizer = None # type: ignore
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters
Mutant 260
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -59,7 +59,7 @@
self.load = load
self.seed = seed
self.policy: Policy = None # type: ignore
- self.optimizer: SACOptimizer = None # type: ignore
+ self.optimizer: SACOptimizer = "" # type: ignore
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters
)
Mutant 261
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -63,7 +63,7 @@
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters
)
- self.step = 0
+ self.step = 1
# Don't divide by zero
self.update_steps = 1
Mutant 262
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -63,7 +63,7 @@
self.hyperparameters: SACSettings = cast(
SACSettings, trainer_settings.hyperparameters
)
- self.step = 0
+ self.step = None
# Don't divide by zero
self.update_steps = 1
Mutant 263
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -66,7 +66,7 @@
self.step = 0
# Don't divide by zero
- self.update_steps = 1
+ self.update_steps = 2
self.reward_signal_update_steps = 1
self.steps_per_update = self.hyperparameters.steps_per_update
Mutant 264
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -66,7 +66,7 @@
self.step = 0
# Don't divide by zero
- self.update_steps = 1
+ self.update_steps = None
self.reward_signal_update_steps = 1
self.steps_per_update = self.hyperparameters.steps_per_update
Mutant 265
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -67,7 +67,7 @@
# Don't divide by zero
self.update_steps = 1
- self.reward_signal_update_steps = 1
+ self.reward_signal_update_steps = 2
self.steps_per_update = self.hyperparameters.steps_per_update
self.reward_signal_steps_per_update = (
Mutant 266
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -67,7 +67,7 @@
# Don't divide by zero
self.update_steps = 1
- self.reward_signal_update_steps = 1
+ self.reward_signal_update_steps = None
self.steps_per_update = self.hyperparameters.steps_per_update
self.reward_signal_steps_per_update = (
Mutant 268
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -74,7 +74,7 @@
self.hyperparameters.reward_signal_steps_per_update
)
- self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer
+ self.checkpoint_replay_buffer = None
def _checkpoint(self) -> NNCheckpoint:
"""
Mutant 273
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -124,7 +124,7 @@
"""
super()._process_trajectory(trajectory)
last_step = trajectory.steps[-1]
- agent_id = trajectory.agent_id # All the agents should have the same ID
+ agent_id = None # All the agents should have the same ID
agent_buffer_trajectory = trajectory.to_agentbuffer()
Mutant 277
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -133,7 +133,7 @@
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
# Evaluate all reward functions for reporting purposes
- self.collected_rewards["environment"][agent_id] += np.sum(
+ self.collected_rewards["environment"][agent_id] = np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
Mutant 278
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -133,7 +133,7 @@
self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])
# Evaluate all reward functions for reporting purposes
- self.collected_rewards["environment"][agent_id] += np.sum(
+ self.collected_rewards["environment"][agent_id] -= np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
Mutant 280
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -141,7 +141,7 @@
agent_buffer_trajectory
).scaled_reward
# Report the reward signals
- self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
+ self.collected_rewards[name][agent_id] = np.sum(evaluate_result)
# Get all value estimates for reporting purposes
value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
Mutant 281
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -141,7 +141,7 @@
agent_buffer_trajectory
).scaled_reward
# Report the reward signals
- self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
+ self.collected_rewards[name][agent_id] -= np.sum(evaluate_result)
# Get all value estimates for reporting purposes
value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
Mutant 282
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -178,7 +178,7 @@
:return: A boolean corresponding to whether or not _update_policy() can be run
"""
return (
- self.update_buffer.num_experiences >= self.hyperparameters.batch_size
+ self.update_buffer.num_experiences > self.hyperparameters.batch_size
and self.step >= self.hyperparameters.buffer_init_steps
)
Mutant 283
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -179,7 +179,7 @@
"""
return (
self.update_buffer.num_experiences >= self.hyperparameters.batch_size
- and self.step >= self.hyperparameters.buffer_init_steps
+ and self.step > self.hyperparameters.buffer_init_steps
)
@timed
Mutant 284
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -178,8 +178,7 @@
:return: A boolean corresponding to whether or not _update_policy() can be run
"""
return (
- self.update_buffer.num_experiences >= self.hyperparameters.batch_size
- and self.step >= self.hyperparameters.buffer_init_steps
+ self.update_buffer.num_experiences >= self.hyperparameters.batch_size or self.step >= self.hyperparameters.buffer_init_steps
)
@timed
Mutant 285
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -182,7 +182,6 @@
and self.step >= self.hyperparameters.buffer_init_steps
)
- @timed
def _update_policy(self) -> bool:
"""
Update the SAC policy and reward signals. The reward signal generators are updated using different mini batches.
Mutant 289
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -205,7 +205,7 @@
self.load,
tanh_squash=True,
reparameterize=True,
- create_tf_graph=False,
+ create_tf_graph=True,
)
# Load the replay buffer if load
if self.load and self.checkpoint_replay_buffer:
Mutant 290
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -208,7 +208,7 @@
create_tf_graph=False,
)
# Load the replay buffer if load
- if self.load and self.checkpoint_replay_buffer:
+ if self.load or self.checkpoint_replay_buffer:
try:
self.load_replay_buffer()
except (AttributeError, FileNotFoundError):
Mutant 291
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -228,7 +228,7 @@
Uses update_buffer to update the policy. We sample the update_buffer and update
until the steps_per_update ratio is met.
"""
- has_updated = False
+ has_updated = True
self.cumulative_returns_since_policy_update.clear()
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
Mutant 292
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -228,7 +228,7 @@
Uses update_buffer to update the policy. We sample the update_buffer and update
until the steps_per_update ratio is met.
"""
- has_updated = False
+ has_updated = None
self.cumulative_returns_since_policy_update.clear()
n_sequences = max(
int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
Mutant 296
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -236,7 +236,7 @@
batch_update_stats: Dict[str, list] = defaultdict(list)
while (
- self.step - self.hyperparameters.buffer_init_steps
+ self.step + self.hyperparameters.buffer_init_steps
) / self.update_steps > self.steps_per_update:
logger.debug(f"Updating SAC policy at step {self.step}")
buffer = self.update_buffer
Mutant 298
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -237,7 +237,7 @@
batch_update_stats: Dict[str, list] = defaultdict(list)
while (
self.step - self.hyperparameters.buffer_init_steps
- ) / self.update_steps > self.steps_per_update:
+ ) / self.update_steps >= self.steps_per_update:
logger.debug(f"Updating SAC policy at step {self.step}")
buffer = self.update_buffer
if self.update_buffer.num_experiences >= self.hyperparameters.batch_size:
Mutant 305
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -259,7 +259,7 @@
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
- has_updated = True
+ has_updated = False
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()
Mutant 306
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -259,7 +259,7 @@
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
- has_updated = True
+ has_updated = None
if self.optimizer.bc_module:
update_stats = self.optimizer.bc_module.update()
Mutant 308
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -268,7 +268,7 @@
# Truncate update buffer if neccessary. Truncate more than we need to to avoid truncating
# a large buffer at each update.
- if self.update_buffer.num_experiences > self.hyperparameters.buffer_size:
+ if self.update_buffer.num_experiences >= self.hyperparameters.buffer_size:
self.update_buffer.truncate(
int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
)
Mutant 309
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -270,7 +270,7 @@
# a large buffer at each update.
if self.update_buffer.num_experiences > self.hyperparameters.buffer_size:
self.update_buffer.truncate(
- int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT)
+ int(self.hyperparameters.buffer_size / BUFFER_TRUNCATE_PERCENT)
)
return has_updated
Mutant 311
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -286,7 +286,7 @@
"""
buffer = self.update_buffer
n_sequences = max(
- int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
+ int(self.hyperparameters.batch_size * self.policy.sequence_length), 1
)
batch_update_stats: Dict[str, list] = defaultdict(list)
while (
Mutant 312
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -286,7 +286,7 @@
"""
buffer = self.update_buffer
n_sequences = max(
- int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
+ int(self.hyperparameters.batch_size / self.policy.sequence_length), 2
)
batch_update_stats: Dict[str, list] = defaultdict(list)
while (
Mutant 314
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -290,7 +290,7 @@
)
batch_update_stats: Dict[str, list] = defaultdict(list)
while (
- self.step - self.hyperparameters.buffer_init_steps
+ self.step + self.hyperparameters.buffer_init_steps
) / self.reward_signal_update_steps > self.reward_signal_steps_per_update:
# Get minibatches for reward signal update if needed
reward_signal_minibatches = {}
Mutant 316
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -291,7 +291,7 @@
batch_update_stats: Dict[str, list] = defaultdict(list)
while (
self.step - self.hyperparameters.buffer_init_steps
- ) / self.reward_signal_update_steps > self.reward_signal_steps_per_update:
+ ) / self.reward_signal_update_steps >= self.reward_signal_steps_per_update:
# Get minibatches for reward signal update if needed
reward_signal_minibatches = {}
for name, signal in self.optimizer.reward_signals.items():
Mutant 319
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -307,7 +307,7 @@
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
- self.reward_signal_update_steps += 1
+ self.reward_signal_update_steps -= 1
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
Mutant 320
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -307,7 +307,7 @@
)
for stat_name, value in update_stats.items():
batch_update_stats[stat_name].append(value)
- self.reward_signal_update_steps += 1
+ self.reward_signal_update_steps += 2
for stat, stat_list in batch_update_stats.items():
self._stats_reporter.add_stat(stat, np.mean(stat_list))
Mutant 324
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -332,7 +332,7 @@
self.policies[parsed_behavior_id.behavior_id] = policy
self.optimizer = self.create_sac_optimizer()
for _reward_signal in self.optimizer.reward_signals.keys():
- self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
+ self.collected_rewards[_reward_signal] = defaultdict(lambda: 1)
# Needed to resume loads properly
self.step = policy.get_current_step()
# Assume steps were updated at the correct ratio before
Mutant 329
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -336,7 +336,7 @@
# Needed to resume loads properly
self.step = policy.get_current_step()
# Assume steps were updated at the correct ratio before
- self.update_steps = int(max(1, self.step / self.steps_per_update))
+ self.update_steps = int(max(1, self.step * self.steps_per_update))
self.reward_signal_update_steps = int(
max(1, self.step / self.reward_signal_steps_per_update)
)
Mutant 331
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -338,7 +338,7 @@
# Assume steps were updated at the correct ratio before
self.update_steps = int(max(1, self.step / self.steps_per_update))
self.reward_signal_update_steps = int(
- max(1, self.step / self.reward_signal_steps_per_update)
+ max(2, self.step / self.reward_signal_steps_per_update)
)
def get_policy(self, name_behavior_id: str) -> Policy:
Mutant 332
--- ml-agents/mlagents/trainers/sac/trainer.py
+++ ml-agents/mlagents/trainers/sac/trainer.py
@@ -338,7 +338,7 @@
# Assume steps were updated at the correct ratio before
self.update_steps = int(max(1, self.step / self.steps_per_update))
self.reward_signal_update_steps = int(
- max(1, self.step / self.reward_signal_steps_per_update)
+ max(1, self.step * self.reward_signal_steps_per_update)
)
def get_policy(self, name_behavior_id: str) -> Policy: