fairseq/models/transformer.py
Killed 22 out of 151 mutants
Survived
Survived mutation testing. These mutants show holes in your test suite.
Mutant 171
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -31,7 +31,7 @@
from torch import Tensor
-DEFAULT_MAX_SOURCE_POSITIONS = 1024
+DEFAULT_MAX_SOURCE_POSITIONS = 1025
DEFAULT_MAX_TARGET_POSITIONS = 1024
Mutant 172
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -31,7 +31,7 @@
from torch import Tensor
-DEFAULT_MAX_SOURCE_POSITIONS = 1024
+DEFAULT_MAX_SOURCE_POSITIONS = None
DEFAULT_MAX_TARGET_POSITIONS = 1024
Mutant 173
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -32,7 +32,7 @@
DEFAULT_MAX_SOURCE_POSITIONS = 1024
-DEFAULT_MAX_TARGET_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = 1025
@register_model("transformer")
Mutant 174
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -32,7 +32,7 @@
DEFAULT_MAX_SOURCE_POSITIONS = 1024
-DEFAULT_MAX_TARGET_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = None
@register_model("transformer")
Mutant 177
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -53,7 +53,6 @@
:prog:
"""
- @classmethod
def hub_models(cls):
# fmt: off
Mutant 178
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -91,7 +91,6 @@
self.args = args
self.supports_align_args = True
- @staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
# fmt: off
Mutant 179
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -95,7 +95,7 @@
def add_args(parser):
"""Add model-specific arguments to the parser."""
# fmt: off
- parser.add_argument('--activation-fn',
+ parser.add_argument('XX--activation-fnXX',
choices=utils.get_available_activation_fns(),
help='activation function to use')
parser.add_argument('--dropout', type=float, metavar='D',
Mutant 180
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -97,7 +97,7 @@
# fmt: off
parser.add_argument('--activation-fn',
choices=utils.get_available_activation_fns(),
- help='activation function to use')
+ help='XXactivation function to useXX')
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', type=float, metavar='D',
Mutant 181
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -98,7 +98,7 @@
parser.add_argument('--activation-fn',
choices=utils.get_available_activation_fns(),
help='activation function to use')
- parser.add_argument('--dropout', type=float, metavar='D',
+ parser.add_argument('XX--dropoutXX', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', type=float, metavar='D',
help='dropout probability for attention weights')
Mutant 182
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -98,7 +98,7 @@
parser.add_argument('--activation-fn',
choices=utils.get_available_activation_fns(),
help='activation function to use')
- parser.add_argument('--dropout', type=float, metavar='D',
+ parser.add_argument('--dropout', type=float, metavar='XXDXX',
help='dropout probability')
parser.add_argument('--attention-dropout', type=float, metavar='D',
help='dropout probability for attention weights')
Mutant 183
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -99,7 +99,7 @@
choices=utils.get_available_activation_fns(),
help='activation function to use')
parser.add_argument('--dropout', type=float, metavar='D',
- help='dropout probability')
+ help='XXdropout probabilityXX')
parser.add_argument('--attention-dropout', type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
Mutant 184
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -100,7 +100,7 @@
help='activation function to use')
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
- parser.add_argument('--attention-dropout', type=float, metavar='D',
+ parser.add_argument('XX--attention-dropoutXX', type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
help='dropout probability after activation in FFN.')
Mutant 185
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -100,7 +100,7 @@
help='activation function to use')
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
- parser.add_argument('--attention-dropout', type=float, metavar='D',
+ parser.add_argument('--attention-dropout', type=float, metavar='XXDXX',
help='dropout probability for attention weights')
parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
help='dropout probability after activation in FFN.')
Mutant 186
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -101,7 +101,7 @@
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', type=float, metavar='D',
- help='dropout probability for attention weights')
+ help='XXdropout probability for attention weightsXX')
parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
help='dropout probability after activation in FFN.')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
Mutant 189
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -102,7 +102,7 @@
help='dropout probability')
parser.add_argument('--attention-dropout', type=float, metavar='D',
help='dropout probability for attention weights')
- parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
+ parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='XXDXX',
help='dropout probability after activation in FFN.')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
Mutant 190
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -103,7 +103,7 @@
parser.add_argument('--attention-dropout', type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
- help='dropout probability after activation in FFN.')
+ help='XXdropout probability after activation in FFN.XX')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
Mutant 191
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -104,7 +104,7 @@
help='dropout probability for attention weights')
parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
help='dropout probability after activation in FFN.')
- parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+ parser.add_argument('XX--encoder-embed-pathXX', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
Mutant 192
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -104,7 +104,7 @@
help='dropout probability for attention weights')
parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
help='dropout probability after activation in FFN.')
- parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+ parser.add_argument('--encoder-embed-path', type=str, metavar='XXSTRXX',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
Mutant 193
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -105,7 +105,7 @@
parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
help='dropout probability after activation in FFN.')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
- help='path to pre-trained encoder embedding')
+ help='XXpath to pre-trained encoder embeddingXX')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
Mutant 194
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -106,7 +106,7 @@
help='dropout probability after activation in FFN.')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
- parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+ parser.add_argument('XX--encoder-embed-dimXX', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
help='encoder embedding dimension for FFN')
Mutant 195
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -106,7 +106,7 @@
help='dropout probability after activation in FFN.')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
- parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+ parser.add_argument('--encoder-embed-dim', type=int, metavar='XXNXX',
help='encoder embedding dimension')
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
help='encoder embedding dimension for FFN')
Mutant 196
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -107,7 +107,7 @@
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
- help='encoder embedding dimension')
+ help='XXencoder embedding dimensionXX')
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
help='encoder embedding dimension for FFN')
parser.add_argument('--encoder-layers', type=int, metavar='N',
Mutant 197
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -108,7 +108,7 @@
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
- parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
+ parser.add_argument('XX--encoder-ffn-embed-dimXX', type=int, metavar='N',
help='encoder embedding dimension for FFN')
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='num encoder layers')
Mutant 198
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -108,7 +108,7 @@
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
- parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
+ parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='XXNXX',
help='encoder embedding dimension for FFN')
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='num encoder layers')
Mutant 199
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -109,7 +109,7 @@
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
- help='encoder embedding dimension for FFN')
+ help='XXencoder embedding dimension for FFNXX')
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='num encoder layers')
parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
Mutant 200
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -110,7 +110,7 @@
help='encoder embedding dimension')
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
help='encoder embedding dimension for FFN')
- parser.add_argument('--encoder-layers', type=int, metavar='N',
+ parser.add_argument('XX--encoder-layersXX', type=int, metavar='N',
help='num encoder layers')
parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
help='num encoder attention heads')
Mutant 201
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -110,7 +110,7 @@
help='encoder embedding dimension')
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
help='encoder embedding dimension for FFN')
- parser.add_argument('--encoder-layers', type=int, metavar='N',
+ parser.add_argument('--encoder-layers', type=int, metavar='XXNXX',
help='num encoder layers')
parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
help='num encoder attention heads')
Mutant 202
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -111,7 +111,7 @@
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
help='encoder embedding dimension for FFN')
parser.add_argument('--encoder-layers', type=int, metavar='N',
- help='num encoder layers')
+ help='XXnum encoder layersXX')
parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
help='num encoder attention heads')
parser.add_argument('--encoder-normalize-before', action='store_true',
Mutant 203
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -112,7 +112,7 @@
help='encoder embedding dimension for FFN')
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='num encoder layers')
- parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
+ parser.add_argument('XX--encoder-attention-headsXX', type=int, metavar='N',
help='num encoder attention heads')
parser.add_argument('--encoder-normalize-before', action='store_true',
help='apply layernorm before each encoder block')
Mutant 204
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -112,7 +112,7 @@
help='encoder embedding dimension for FFN')
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='num encoder layers')
- parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
+ parser.add_argument('--encoder-attention-heads', type=int, metavar='XXNXX',
help='num encoder attention heads')
parser.add_argument('--encoder-normalize-before', action='store_true',
help='apply layernorm before each encoder block')
Mutant 205
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -113,7 +113,7 @@
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='num encoder layers')
parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
- help='num encoder attention heads')
+ help='XXnum encoder attention headsXX')
parser.add_argument('--encoder-normalize-before', action='store_true',
help='apply layernorm before each encoder block')
parser.add_argument('--encoder-learned-pos', action='store_true',
Mutant 206
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -114,7 +114,7 @@
help='num encoder layers')
parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
help='num encoder attention heads')
- parser.add_argument('--encoder-normalize-before', action='store_true',
+ parser.add_argument('XX--encoder-normalize-beforeXX', action='store_true',
help='apply layernorm before each encoder block')
parser.add_argument('--encoder-learned-pos', action='store_true',
help='use learned positional embeddings in the encoder')
Mutant 208
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -115,7 +115,7 @@
parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
help='num encoder attention heads')
parser.add_argument('--encoder-normalize-before', action='store_true',
- help='apply layernorm before each encoder block')
+ help='XXapply layernorm before each encoder blockXX')
parser.add_argument('--encoder-learned-pos', action='store_true',
help='use learned positional embeddings in the encoder')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
Mutant 209
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -116,7 +116,7 @@
help='num encoder attention heads')
parser.add_argument('--encoder-normalize-before', action='store_true',
help='apply layernorm before each encoder block')
- parser.add_argument('--encoder-learned-pos', action='store_true',
+ parser.add_argument('XX--encoder-learned-posXX', action='store_true',
help='use learned positional embeddings in the encoder')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
Mutant 211
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -117,7 +117,7 @@
parser.add_argument('--encoder-normalize-before', action='store_true',
help='apply layernorm before each encoder block')
parser.add_argument('--encoder-learned-pos', action='store_true',
- help='use learned positional embeddings in the encoder')
+ help='XXuse learned positional embeddings in the encoderXX')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
Mutant 212
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -118,7 +118,7 @@
help='apply layernorm before each encoder block')
parser.add_argument('--encoder-learned-pos', action='store_true',
help='use learned positional embeddings in the encoder')
- parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+ parser.add_argument('XX--decoder-embed-pathXX', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
Mutant 213
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -118,7 +118,7 @@
help='apply layernorm before each encoder block')
parser.add_argument('--encoder-learned-pos', action='store_true',
help='use learned positional embeddings in the encoder')
- parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+ parser.add_argument('--decoder-embed-path', type=str, metavar='XXSTRXX',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
Mutant 214
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -119,7 +119,7 @@
parser.add_argument('--encoder-learned-pos', action='store_true',
help='use learned positional embeddings in the encoder')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
- help='path to pre-trained decoder embedding')
+ help='XXpath to pre-trained decoder embeddingXX')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
Mutant 215
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -120,7 +120,7 @@
help='use learned positional embeddings in the encoder')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
- parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+ parser.add_argument('XX--decoder-embed-dimXX', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
Mutant 216
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -120,7 +120,7 @@
help='use learned positional embeddings in the encoder')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
- parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+ parser.add_argument('--decoder-embed-dim', type=int, metavar='XXNXX',
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
Mutant 217
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -121,7 +121,7 @@
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
- help='decoder embedding dimension')
+ help='XXdecoder embedding dimensionXX')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
Mutant 218
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -122,7 +122,7 @@
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
- parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+ parser.add_argument('XX--decoder-ffn-embed-dimXX', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
Mutant 219
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -122,7 +122,7 @@
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
- parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+ parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='XXNXX',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
Mutant 220
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -123,7 +123,7 @@
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
- help='decoder embedding dimension for FFN')
+ help='XXdecoder embedding dimension for FFNXX')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
Mutant 221
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -124,7 +124,7 @@
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
- parser.add_argument('--decoder-layers', type=int, metavar='N',
+ parser.add_argument('XX--decoder-layersXX', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
Mutant 222
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -124,7 +124,7 @@
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
- parser.add_argument('--decoder-layers', type=int, metavar='N',
+ parser.add_argument('--decoder-layers', type=int, metavar='XXNXX',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
Mutant 223
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -125,7 +125,7 @@
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
- help='num decoder layers')
+ help='XXnum decoder layersXX')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-learned-pos', action='store_true',
Mutant 224
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -126,7 +126,7 @@
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
- parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+ parser.add_argument('XX--decoder-attention-headsXX', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
Mutant 225
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -126,7 +126,7 @@
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
- parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+ parser.add_argument('--decoder-attention-heads', type=int, metavar='XXNXX',
help='num decoder attention heads')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
Mutant 226
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -127,7 +127,7 @@
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
- help='num decoder attention heads')
+ help='XXnum decoder attention headsXX')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
parser.add_argument('--decoder-normalize-before', action='store_true',
Mutant 227
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -128,7 +128,7 @@
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
- parser.add_argument('--decoder-learned-pos', action='store_true',
+ parser.add_argument('XX--decoder-learned-posXX', action='store_true',
help='use learned positional embeddings in the decoder')
parser.add_argument('--decoder-normalize-before', action='store_true',
help='apply layernorm before each decoder block')
Mutant 229
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -129,7 +129,7 @@
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-learned-pos', action='store_true',
- help='use learned positional embeddings in the decoder')
+ help='XXuse learned positional embeddings in the decoderXX')
parser.add_argument('--decoder-normalize-before', action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
Mutant 230
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -130,7 +130,7 @@
help='num decoder attention heads')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
- parser.add_argument('--decoder-normalize-before', action='store_true',
+ parser.add_argument('XX--decoder-normalize-beforeXX', action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension (extra linear layer '
Mutant 232
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -131,7 +131,7 @@
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
parser.add_argument('--decoder-normalize-before', action='store_true',
- help='apply layernorm before each decoder block')
+ help='XXapply layernorm before each decoder blockXX')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension (extra linear layer '
'if different from decoder embed dim')
Mutant 233
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -132,7 +132,7 @@
help='use learned positional embeddings in the decoder')
parser.add_argument('--decoder-normalize-before', action='store_true',
help='apply layernorm before each decoder block')
- parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+ parser.add_argument('XX--decoder-output-dimXX', type=int, metavar='N',
help='decoder output dimension (extra linear layer '
'if different from decoder embed dim')
parser.add_argument('--share-decoder-input-output-embed', action='store_true',
Mutant 234
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -132,7 +132,7 @@
help='use learned positional embeddings in the decoder')
parser.add_argument('--decoder-normalize-before', action='store_true',
help='apply layernorm before each decoder block')
- parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+ parser.add_argument('--decoder-output-dim', type=int, metavar='XXNXX',
help='decoder output dimension (extra linear layer '
'if different from decoder embed dim')
parser.add_argument('--share-decoder-input-output-embed', action='store_true',
Mutant 235
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -133,7 +133,7 @@
parser.add_argument('--decoder-normalize-before', action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
- help='decoder output dimension (extra linear layer '
+ help='XXdecoder output dimension (extra linear layer XX'
'if different from decoder embed dim')
parser.add_argument('--share-decoder-input-output-embed', action='store_true',
help='share decoder input and output embeddings')
Mutant 236
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -135,7 +135,7 @@
parser.add_argument('--decoder-output-dim', type=int, metavar='N',
help='decoder output dimension (extra linear layer '
'if different from decoder embed dim')
- parser.add_argument('--share-decoder-input-output-embed', action='store_true',
+ parser.add_argument('XX--share-decoder-input-output-embedXX', action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--share-all-embeddings', action='store_true',
help='share encoder, decoder and output embeddings'
Mutant 238
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -136,7 +136,7 @@
help='decoder output dimension (extra linear layer '
'if different from decoder embed dim')
parser.add_argument('--share-decoder-input-output-embed', action='store_true',
- help='share decoder input and output embeddings')
+ help='XXshare decoder input and output embeddingsXX')
parser.add_argument('--share-all-embeddings', action='store_true',
help='share encoder, decoder and output embeddings'
' (requires shared dictionary and embed dim)')
Mutant 239
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -137,7 +137,7 @@
'if different from decoder embed dim')
parser.add_argument('--share-decoder-input-output-embed', action='store_true',
help='share decoder input and output embeddings')
- parser.add_argument('--share-all-embeddings', action='store_true',
+ parser.add_argument('XX--share-all-embeddingsXX', action='store_true',
help='share encoder, decoder and output embeddings'
' (requires shared dictionary and embed dim)')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
Mutant 241
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -138,7 +138,7 @@
parser.add_argument('--share-decoder-input-output-embed', action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--share-all-embeddings', action='store_true',
- help='share encoder, decoder and output embeddings'
+ help='XXshare encoder, decoder and output embeddingsXX'
' (requires shared dictionary and embed dim)')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
Mutant 242
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -140,7 +140,7 @@
parser.add_argument('--share-all-embeddings', action='store_true',
help='share encoder, decoder and output embeddings'
' (requires shared dictionary and embed dim)')
- parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
+ parser.add_argument('XX--no-token-positional-embeddingsXX', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
Mutant 243
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -140,7 +140,7 @@
parser.add_argument('--share-all-embeddings', action='store_true',
help='share encoder, decoder and output embeddings'
' (requires shared dictionary and embed dim)')
- parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
+ parser.add_argument('--no-token-positional-embeddings', default=True, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
Mutant 245
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -141,7 +141,7 @@
help='share encoder, decoder and output embeddings'
' (requires shared dictionary and embed dim)')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
- help='if set, disables positional embeddings (outside self attention)')
+ help='XXif set, disables positional embeddings (outside self attention)XX')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion'),
Mutant 246
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -142,7 +142,7 @@
' (requires shared dictionary and embed dim)')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
- parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+ parser.add_argument('XX--adaptive-softmax-cutoffXX', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion'),
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
Mutant 247
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -142,7 +142,7 @@
' (requires shared dictionary and embed dim)')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
- parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+ parser.add_argument('--adaptive-softmax-cutoff', metavar='XXEXPRXX',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion'),
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
Mutant 248
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -143,7 +143,7 @@
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
- help='comma separated list of adaptive softmax cutoff points. '
+ help='XXcomma separated list of adaptive softmax cutoff points. XX'
'Must be used with adaptive_loss criterion'),
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
Mutant 249
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -145,7 +145,7 @@
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion'),
- parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+ parser.add_argument('XX--adaptive-softmax-dropoutXX', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--layernorm-embedding', action='store_true',
help='add layernorm to embedding')
Mutant 250
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -145,7 +145,7 @@
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion'),
- parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+ parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='XXDXX',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--layernorm-embedding', action='store_true',
help='add layernorm to embedding')
Mutant 251
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -146,7 +146,7 @@
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion'),
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
- help='sets adaptive softmax dropout for the tail projections')
+ help='XXsets adaptive softmax dropout for the tail projectionsXX')
parser.add_argument('--layernorm-embedding', action='store_true',
help='add layernorm to embedding')
parser.add_argument('--no-scale-embedding', action='store_true',
Mutant 252
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -147,7 +147,7 @@
'Must be used with adaptive_loss criterion'),
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
- parser.add_argument('--layernorm-embedding', action='store_true',
+ parser.add_argument('XX--layernorm-embeddingXX', action='store_true',
help='add layernorm to embedding')
parser.add_argument('--no-scale-embedding', action='store_true',
help='if True, dont scale embeddings')
Mutant 254
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -148,7 +148,7 @@
parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--layernorm-embedding', action='store_true',
- help='add layernorm to embedding')
+ help='XXadd layernorm to embeddingXX')
parser.add_argument('--no-scale-embedding', action='store_true',
help='if True, dont scale embeddings')
# args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
Mutant 255
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -149,7 +149,7 @@
help='sets adaptive softmax dropout for the tail projections')
parser.add_argument('--layernorm-embedding', action='store_true',
help='add layernorm to embedding')
- parser.add_argument('--no-scale-embedding', action='store_true',
+ parser.add_argument('XX--no-scale-embeddingXX', action='store_true',
help='if True, dont scale embeddings')
# args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
parser.add_argument('--no-cross-attention', default=False, action='store_true',
Mutant 257
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -150,7 +150,7 @@
parser.add_argument('--layernorm-embedding', action='store_true',
help='add layernorm to embedding')
parser.add_argument('--no-scale-embedding', action='store_true',
- help='if True, dont scale embeddings')
+ help='XXif True, dont scale embeddingsXX')
# args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
parser.add_argument('--no-cross-attention', default=False, action='store_true',
help='do not perform cross-attention')
Mutant 258
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -152,7 +152,7 @@
parser.add_argument('--no-scale-embedding', action='store_true',
help='if True, dont scale embeddings')
# args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
- parser.add_argument('--no-cross-attention', default=False, action='store_true',
+ parser.add_argument('XX--no-cross-attentionXX', default=False, action='store_true',
help='do not perform cross-attention')
parser.add_argument('--cross-self-attention', default=False, action='store_true',
help='perform cross+self-attention')
Mutant 259
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -152,7 +152,7 @@
parser.add_argument('--no-scale-embedding', action='store_true',
help='if True, dont scale embeddings')
# args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
- parser.add_argument('--no-cross-attention', default=False, action='store_true',
+ parser.add_argument('--no-cross-attention', default=True, action='store_true',
help='do not perform cross-attention')
parser.add_argument('--cross-self-attention', default=False, action='store_true',
help='perform cross+self-attention')
Mutant 261
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -153,7 +153,7 @@
help='if True, dont scale embeddings')
# args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
parser.add_argument('--no-cross-attention', default=False, action='store_true',
- help='do not perform cross-attention')
+ help='XXdo not perform cross-attentionXX')
parser.add_argument('--cross-self-attention', default=False, action='store_true',
help='perform cross+self-attention')
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
Mutant 262
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -154,7 +154,7 @@
# args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
parser.add_argument('--no-cross-attention', default=False, action='store_true',
help='do not perform cross-attention')
- parser.add_argument('--cross-self-attention', default=False, action='store_true',
+ parser.add_argument('XX--cross-self-attentionXX', default=False, action='store_true',
help='perform cross+self-attention')
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
Mutant 263
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -154,7 +154,7 @@
# args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
parser.add_argument('--no-cross-attention', default=False, action='store_true',
help='do not perform cross-attention')
- parser.add_argument('--cross-self-attention', default=False, action='store_true',
+ parser.add_argument('--cross-self-attention', default=True, action='store_true',
help='perform cross+self-attention')
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
Mutant 265
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -155,7 +155,7 @@
parser.add_argument('--no-cross-attention', default=False, action='store_true',
help='do not perform cross-attention')
parser.add_argument('--cross-self-attention', default=False, action='store_true',
- help='perform cross+self-attention')
+ help='XXperform cross+self-attentionXX')
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for encoder')
Mutant 266
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -157,7 +157,7 @@
parser.add_argument('--cross-self-attention', default=False, action='store_true',
help='perform cross+self-attention')
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
- parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+ parser.add_argument('XX--encoder-layerdropXX', type=float, metavar='D', default=0,
help='LayerDrop probability for encoder')
parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for decoder')
Mutant 267
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -157,7 +157,7 @@
parser.add_argument('--cross-self-attention', default=False, action='store_true',
help='perform cross+self-attention')
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
- parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+ parser.add_argument('--encoder-layerdrop', type=float, metavar='XXDXX', default=0,
help='LayerDrop probability for encoder')
parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for decoder')
Mutant 268
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -157,7 +157,7 @@
parser.add_argument('--cross-self-attention', default=False, action='store_true',
help='perform cross+self-attention')
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
- parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+ parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=1,
help='LayerDrop probability for encoder')
parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for decoder')
Mutant 269
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -158,7 +158,7 @@
help='perform cross+self-attention')
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
- help='LayerDrop probability for encoder')
+ help='XXLayerDrop probability for encoderXX')
parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for decoder')
parser.add_argument('--encoder-layers-to-keep', default=None,
Mutant 270
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -159,7 +159,7 @@
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for encoder')
- parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+ parser.add_argument('XX--decoder-layerdropXX', type=float, metavar='D', default=0,
help='LayerDrop probability for decoder')
parser.add_argument('--encoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
Mutant 271
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -159,7 +159,7 @@
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for encoder')
- parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+ parser.add_argument('--decoder-layerdrop', type=float, metavar='XXDXX', default=0,
help='LayerDrop probability for decoder')
parser.add_argument('--encoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
Mutant 272
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -159,7 +159,7 @@
# args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for encoder')
- parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+ parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=1,
help='LayerDrop probability for decoder')
parser.add_argument('--encoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
Mutant 273
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -160,7 +160,7 @@
parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for encoder')
parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
- help='LayerDrop probability for decoder')
+ help='XXLayerDrop probability for decoderXX')
parser.add_argument('--encoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
parser.add_argument('--decoder-layers-to-keep', default=None,
Mutant 274
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -161,7 +161,7 @@
help='LayerDrop probability for encoder')
parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for decoder')
- parser.add_argument('--encoder-layers-to-keep', default=None,
+ parser.add_argument('XX--encoder-layers-to-keepXX', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
parser.add_argument('--decoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
Mutant 275
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -162,7 +162,7 @@
parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
help='LayerDrop probability for decoder')
parser.add_argument('--encoder-layers-to-keep', default=None,
- help='which layers to *keep* when pruning as a comma-separated list')
+ help='XXwhich layers to *keep* when pruning as a comma-separated listXX')
parser.add_argument('--decoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
Mutant 276
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -163,7 +163,7 @@
help='LayerDrop probability for decoder')
parser.add_argument('--encoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
- parser.add_argument('--decoder-layers-to-keep', default=None,
+ parser.add_argument('XX--decoder-layers-to-keepXX', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
Mutant 277
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -164,7 +164,7 @@
parser.add_argument('--encoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
parser.add_argument('--decoder-layers-to-keep', default=None,
- help='which layers to *keep* when pruning as a comma-separated list')
+ help='XXwhich layers to *keep* when pruning as a comma-separated listXX')
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
help='iterative PQ quantization noise at training time')
Mutant 278
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -166,7 +166,7 @@
parser.add_argument('--decoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
- parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+ parser.add_argument('XX--quant-noise-pqXX', type=float, metavar='D', default=0,
help='iterative PQ quantization noise at training time')
parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
help='block size of quantization noise at training time')
Mutant 279
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -166,7 +166,7 @@
parser.add_argument('--decoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
- parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+ parser.add_argument('--quant-noise-pq', type=float, metavar='XXDXX', default=0,
help='iterative PQ quantization noise at training time')
parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
help='block size of quantization noise at training time')
Mutant 280
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -166,7 +166,7 @@
parser.add_argument('--decoder-layers-to-keep', default=None,
help='which layers to *keep* when pruning as a comma-separated list')
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
- parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+ parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=1,
help='iterative PQ quantization noise at training time')
parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
help='block size of quantization noise at training time')
Mutant 281
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -167,7 +167,7 @@
help='which layers to *keep* when pruning as a comma-separated list')
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
- help='iterative PQ quantization noise at training time')
+ help='XXiterative PQ quantization noise at training timeXX')
parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
help='block size of quantization noise at training time')
parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
Mutant 282
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -168,7 +168,7 @@
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
help='iterative PQ quantization noise at training time')
- parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+ parser.add_argument('XX--quant-noise-pq-block-sizeXX', type=int, metavar='D', default=8,
help='block size of quantization noise at training time')
parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
help='scalar quantization noise and scalar quantization at training time')
Mutant 283
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -168,7 +168,7 @@
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
help='iterative PQ quantization noise at training time')
- parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+ parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='XXDXX', default=8,
help='block size of quantization noise at training time')
parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
help='scalar quantization noise and scalar quantization at training time')
Mutant 284
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -168,7 +168,7 @@
# args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
help='iterative PQ quantization noise at training time')
- parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+ parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=9,
help='block size of quantization noise at training time')
parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
help='scalar quantization noise and scalar quantization at training time')
Mutant 285
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -169,7 +169,7 @@
parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
help='iterative PQ quantization noise at training time')
parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
- help='block size of quantization noise at training time')
+ help='XXblock size of quantization noise at training timeXX')
parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
help='scalar quantization noise and scalar quantization at training time')
# fmt: on
Mutant 286
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -170,7 +170,7 @@
help='iterative PQ quantization noise at training time')
parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
help='block size of quantization noise at training time')
- parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+ parser.add_argument('XX--quant-noise-scalarXX', type=float, metavar='D', default=0,
help='scalar quantization noise and scalar quantization at training time')
# fmt: on
Mutant 287
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -170,7 +170,7 @@
help='iterative PQ quantization noise at training time')
parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
help='block size of quantization noise at training time')
- parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+ parser.add_argument('--quant-noise-scalar', type=float, metavar='XXDXX', default=0,
help='scalar quantization noise and scalar quantization at training time')
# fmt: on
Mutant 288
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -170,7 +170,7 @@
help='iterative PQ quantization noise at training time')
parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
help='block size of quantization noise at training time')
- parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+ parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=1,
help='scalar quantization noise and scalar quantization at training time')
# fmt: on
Mutant 289
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -171,7 +171,7 @@
parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
help='block size of quantization noise at training time')
parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
- help='scalar quantization noise and scalar quantization at training time')
+ help='XXscalar quantization noise and scalar quantization at training timeXX')
# fmt: on
@classmethod
Mutant 290
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -172,9 +172,7 @@
help='block size of quantization noise at training time')
parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
help='scalar quantization noise and scalar quantization at training time')
- # fmt: on
-
- @classmethod
+
def build_model(cls, args, task):
"""Build a new model instance."""
Mutant 291
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -223,7 +223,6 @@
decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
return cls(args, encoder, decoder)
- @classmethod
def build_embedding(cls, args, dictionary, embed_dim, path=None):
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
Mutant 292
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -235,7 +235,6 @@
utils.load_embedding(embed_dict, dictionary, emb)
return emb
- @classmethod
def build_encoder(cls, args, src_dict, embed_tokens):
return TransformerEncoder(args, src_dict, embed_tokens)
Mutant 293
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -239,7 +239,6 @@
def build_encoder(cls, args, src_dict, embed_tokens):
return TransformerEncoder(args, src_dict, embed_tokens)
- @classmethod
def build_decoder(cls, args, tgt_dict, embed_tokens):
return TransformerDecoder(
args,
Mutant 294
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -280,10 +280,6 @@
)
return decoder_out
- # Since get_normalized_probs is in the Fairseq Model which is not scriptable,
- # I rewrite the get_normalized_probs from Base Class to call the
- # helper function in the Base Class.
- @torch.jit.export
def get_normalized_probs(
self,
net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
Mutant 295
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -374,7 +374,7 @@
x = self.quant_noise(x)
return x, embed
- def forward(self, src_tokens, src_lengths, return_all_hiddens: bool = False):
+ def forward(self, src_tokens, src_lengths, return_all_hiddens: bool = True):
"""
Args:
src_tokens (LongTensor): tokens in the source language of shape
Mutant 296
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -425,7 +425,6 @@
src_lengths=None,
)
- @torch.jit.export
def reorder_encoder_out(self, encoder_out: EncoderOut, new_order):
"""
Reorder encoder output according to *new_order*.
Mutant 297
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -526,7 +526,7 @@
(default: False).
"""
- def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=True):
self.args = args
super().__init__(dictionary)
self.register_buffer("version", torch.Tensor([3]))
Mutant 298
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -633,7 +633,7 @@
self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
)
- def build_decoder_layer(self, args, no_encoder_attn=False):
+ def build_decoder_layer(self, args, no_encoder_attn=True):
return TransformerDecoderLayer(args, no_encoder_attn)
def forward(
Mutant 299
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -645,7 +645,7 @@
alignment_layer: Optional[int] = None,
alignment_heads: Optional[int] = None,
src_lengths: Optional[Any] = None,
- return_all_hiddens: bool = False,
+ return_all_hiddens: bool = True,
):
"""
Args:
Mutant 300
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -891,7 +891,7 @@
return m
-def Linear(in_features, out_features, bias=True):
+def Linear(in_features, out_features, bias=False):
m = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(m.weight)
if bias:
Mutant 302
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -899,7 +899,7 @@
return m
-@register_model_architecture("transformer", "transformer")
+@register_model_architecture("transformer", "XXtransformerXX")
def base_architecture(args):
args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
Mutant 303
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -898,8 +898,6 @@
nn.init.constant_(m.bias, 0.0)
return m
-
-@register_model_architecture("transformer", "transformer")
def base_architecture(args):
args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
Mutant 305
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -944,7 +944,7 @@
args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
-@register_model_architecture("transformer", "transformer_iwslt_de_en")
+@register_model_architecture("transformer", "XXtransformer_iwslt_de_enXX")
def transformer_iwslt_de_en(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)
Mutant 306
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -943,8 +943,6 @@
args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
-
-@register_model_architecture("transformer", "transformer_iwslt_de_en")
def transformer_iwslt_de_en(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)
Mutant 308
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -957,7 +957,7 @@
base_architecture(args)
-@register_model_architecture("transformer", "transformer_wmt_en_de")
+@register_model_architecture("transformer", "XXtransformer_wmt_en_deXX")
def transformer_wmt_en_de(args):
base_architecture(args)
Mutant 309
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -956,8 +956,6 @@
args.decoder_layers = getattr(args, "decoder_layers", 6)
base_architecture(args)
-
-@register_model_architecture("transformer", "transformer_wmt_en_de")
def transformer_wmt_en_de(args):
base_architecture(args)
Mutant 311
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -963,7 +963,7 @@
# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
-@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big")
+@register_model_architecture("transformer", "XXtransformer_vaswani_wmt_en_de_bigXX")
def transformer_vaswani_wmt_en_de_big(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
Mutant 312
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -961,9 +961,6 @@
def transformer_wmt_en_de(args):
base_architecture(args)
-
-# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
-@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big")
def transformer_vaswani_wmt_en_de_big(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
Mutant 314
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -976,7 +976,7 @@
base_architecture(args)
-@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big")
+@register_model_architecture("transformer", "XXtransformer_vaswani_wmt_en_fr_bigXX")
def transformer_vaswani_wmt_en_fr_big(args):
args.dropout = getattr(args, "dropout", 0.1)
transformer_vaswani_wmt_en_de_big(args)
Mutant 315
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -975,8 +975,6 @@
args.dropout = getattr(args, "dropout", 0.3)
base_architecture(args)
-
-@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big")
def transformer_vaswani_wmt_en_fr_big(args):
args.dropout = getattr(args, "dropout", 0.1)
transformer_vaswani_wmt_en_de_big(args)
Mutant 317
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -982,7 +982,7 @@
transformer_vaswani_wmt_en_de_big(args)
-@register_model_architecture("transformer", "transformer_wmt_en_de_big")
+@register_model_architecture("transformer", "XXtransformer_wmt_en_de_bigXX")
def transformer_wmt_en_de_big(args):
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
transformer_vaswani_wmt_en_de_big(args)
Mutant 318
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -981,8 +981,6 @@
args.dropout = getattr(args, "dropout", 0.1)
transformer_vaswani_wmt_en_de_big(args)
-
-@register_model_architecture("transformer", "transformer_wmt_en_de_big")
def transformer_wmt_en_de_big(args):
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
transformer_vaswani_wmt_en_de_big(args)
Mutant 320
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -989,7 +989,7 @@
# default parameters used in tensor2tensor implementation
-@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t")
+@register_model_architecture("transformer", "XXtransformer_wmt_en_de_big_t2tXX")
def transformer_wmt_en_de_big_t2t(args):
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
Mutant 321
--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -987,9 +987,6 @@
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
transformer_vaswani_wmt_en_de_big(args)
-
-# default parameters used in tensor2tensor implementation
-@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t")
def transformer_wmt_en_de_big_t2t(args):
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)