ml-agents/mlagents/trainers/ppo/trainer.py
Killed 53 out of 82 mutants
Survived
Survived mutation testing. These mutants show holes in your test suite.
Mutant 505
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -18,7 +18,7 @@
from mlagents.trainers.settings import TrainerSettings, PPOSettings
-logger = get_logger(__name__)
+logger = None
class PPOTrainer(RLTrainer):
Mutant 506
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -50,7 +50,7 @@
self.hyperparameters: PPOSettings = cast(
PPOSettings, self.trainer_settings.hyperparameters
)
- self.load = load
+ self.load = None
self.seed = seed
self.policy: Policy = None # type: ignore
Mutant 508
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -52,7 +52,7 @@
)
self.load = load
self.seed = seed
- self.policy: Policy = None # type: ignore
+ self.policy: Policy = "" # type: ignore
def _process_trajectory(self, trajectory: Trajectory) -> None:
"""
Mutant 509
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -61,7 +61,7 @@
:param trajectory: The Trajectory tuple containing the steps to be processed.
"""
super()._process_trajectory(trajectory)
- agent_id = trajectory.agent_id # All the agents should have the same ID
+ agent_id = None # All the agents should have the same ID
agent_buffer_trajectory = trajectory.to_agentbuffer()
# Update the normalization
Mutant 512
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -72,7 +72,7 @@
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,
- trajectory.done_reached and not trajectory.interrupted,
+ trajectory.done_reached and trajectory.interrupted,
)
for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
Mutant 513
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -72,7 +72,7 @@
value_estimates, value_next = self.optimizer.get_trajectory_value_estimates(
agent_buffer_trajectory,
trajectory.next_obs,
- trajectory.done_reached and not trajectory.interrupted,
+ trajectory.done_reached or not trajectory.interrupted,
)
for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
Mutant 515
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -81,7 +81,7 @@
)
# Evaluate all reward functions
- self.collected_rewards["environment"][agent_id] += np.sum(
+ self.collected_rewards["environment"][agent_id] = np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
Mutant 516
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -81,7 +81,7 @@
)
# Evaluate all reward functions
- self.collected_rewards["environment"][agent_id] += np.sum(
+ self.collected_rewards["environment"][agent_id] -= np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
Mutant 518
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -90,7 +90,7 @@
).scaled_reward
agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
# Report the reward signals
- self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
+ self.collected_rewards[name][agent_id] = np.sum(evaluate_result)
# Compute GAE and returns
tmp_advantages = []
Mutant 519
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -90,7 +90,7 @@
).scaled_reward
agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
# Report the reward signals
- self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
+ self.collected_rewards[name][agent_id] -= np.sum(evaluate_result)
# Compute GAE and returns
tmp_advantages = []
Mutant 524
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -109,7 +109,7 @@
gamma=self.optimizer.reward_signals[name].gamma,
lambd=self.hyperparameters.lambd,
)
- local_return = local_advantage + local_value_estimates
+ local_return = local_advantage - local_value_estimates
# This is later use as target for the different value estimates
agent_buffer_trajectory[f"{name}_returns"].set(local_return)
agent_buffer_trajectory[f"{name}_advantage"].set(local_advantage)
Mutant 530
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -122,7 +122,7 @@
)
global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
agent_buffer_trajectory["advantages"].set(global_advantages)
- agent_buffer_trajectory["discounted_returns"].set(global_returns)
+ agent_buffer_trajectory["XXdiscounted_returnsXX"].set(global_returns)
# Append to update buffer
agent_buffer_trajectory.resequence_and_append(
self.update_buffer, training_length=self.policy.sequence_length
Mutant 534
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -152,7 +152,7 @@
# will need to reshape the data into a batch_size x sequence_length tensor.
batch_size = (
self.hyperparameters.batch_size
- - self.hyperparameters.batch_size % self.policy.sequence_length
+ + self.hyperparameters.batch_size % self.policy.sequence_length
)
# Make sure there is at least one sequence
batch_size = max(batch_size, self.policy.sequence_length)
Mutant 537
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -158,7 +158,7 @@
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
- int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
+ int(self.hyperparameters.batch_size * self.policy.sequence_length), 1
)
advantages = self.update_buffer["advantages"].get_batch()
Mutant 538
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -158,7 +158,7 @@
batch_size = max(batch_size, self.policy.sequence_length)
n_sequences = max(
- int(self.hyperparameters.batch_size / self.policy.sequence_length), 1
+ int(self.hyperparameters.batch_size / self.policy.sequence_length), 2
)
advantages = self.update_buffer["advantages"].get_batch()
Mutant 544
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -163,7 +163,7 @@
advantages = self.update_buffer["advantages"].get_batch()
self.update_buffer["advantages"].set(
- (advantages - advantages.mean()) / (advantages.std() + 1e-10)
+ (advantages - advantages.mean()) / (advantages.std() - 1e-10)
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
Mutant 545
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -163,7 +163,7 @@
advantages = self.update_buffer["advantages"].get_batch()
self.update_buffer["advantages"].set(
- (advantages - advantages.mean()) / (advantages.std() + 1e-10)
+ (advantages - advantages.mean()) / (advantages.std() + 1.0000000001)
)
num_epoch = self.hyperparameters.num_epoch
batch_update_stats = defaultdict(list)
Mutant 555
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -186,7 +186,7 @@
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
- return True
+ return False
def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
Mutant 556
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -202,7 +202,7 @@
self.trainer_settings,
model_path=self.artifact_path,
load=self.load,
- condition_sigma_on_obs=False, # Faster training for PPO
+ condition_sigma_on_obs=True, # Faster training for PPO
create_tf_graph=False, # We will create the TF graph in the Optimizer
)
Mutant 557
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -203,7 +203,7 @@
model_path=self.artifact_path,
load=self.load,
condition_sigma_on_obs=False, # Faster training for PPO
- create_tf_graph=False, # We will create the TF graph in the Optimizer
+ create_tf_graph=True, # We will create the TF graph in the Optimizer
)
return policy
Mutant 561
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -230,7 +230,7 @@
self.policies[parsed_behavior_id.behavior_id] = policy
self.optimizer = self.create_ppo_optimizer()
for _reward_signal in self.optimizer.reward_signals.keys():
- self.collected_rewards[_reward_signal] = defaultdict(lambda: 0)
+ self.collected_rewards[_reward_signal] = defaultdict(lambda: 1)
# Needed to resume loads properly
self.step = policy.get_current_step()
Mutant 565
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -243,7 +243,7 @@
return self.policy
-def discount_rewards(r, gamma=0.99, value_next=0.0):
+def discount_rewards(r, gamma=1.99, value_next=0.0):
"""
Computes discounted sum of future rewards for use in updating value estimate.
:param r: List of rewards.
Mutant 566
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -243,7 +243,7 @@
return self.policy
-def discount_rewards(r, gamma=0.99, value_next=0.0):
+def discount_rewards(r, gamma=0.99, value_next=1.0):
"""
Computes discounted sum of future rewards for use in updating value estimate.
:param r: List of rewards.
Mutant 569
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -253,7 +253,7 @@
"""
discounted_r = np.zeros_like(r)
running_add = value_next
- for t in reversed(range(0, r.size)):
+ for t in reversed(range(1, r.size)):
running_add = running_add * gamma + r[t]
discounted_r[t] = running_add
return discounted_r
Mutant 574
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -259,7 +259,7 @@
return discounted_r
-def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
+def get_gae(rewards, value_estimates, value_next=1.0, gamma=0.99, lambd=0.95):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
Mutant 575
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -259,7 +259,7 @@
return discounted_r
-def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
+def get_gae(rewards, value_estimates, value_next=0.0, gamma=1.99, lambd=0.95):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
Mutant 576
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -259,7 +259,7 @@
return discounted_r
-def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95):
+def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=1.95):
"""
Computes generalized advantage estimate for use in updating policy.
:param rewards: list of rewards for time-steps t to T.
Mutant 578
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -270,7 +270,7 @@
:return: list of advantage estimates for time-steps t to T.
"""
value_estimates = np.append(value_estimates, value_next)
- delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
+ delta_t = rewards - gamma * value_estimates[1:] - value_estimates[:-1]
advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
return advantage
Mutant 582
--- ml-agents/mlagents/trainers/ppo/trainer.py
+++ ml-agents/mlagents/trainers/ppo/trainer.py
@@ -270,7 +270,7 @@
:return: list of advantage estimates for time-steps t to T.
"""
value_estimates = np.append(value_estimates, value_next)
- delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1]
+ delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:+1]
advantage = discount_rewards(r=delta_t, gamma=gamma * lambd)
return advantage