fairseq/models/transformer_lm.py

Killed 25 out of 146 mutants

Survived

The mutants below survived mutation testing: each is a change to the source that no test detected, so each one points to a hole in your test suite.

Mutant 256

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -18,7 +18,7 @@
     CharacterTokenEmbedder,
 )
 
-DEFAULT_MAX_TARGET_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = 1025
 
 
 @register_model('transformer_lm')

Mutant 257

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -18,7 +18,7 @@
     CharacterTokenEmbedder,
 )
 
-DEFAULT_MAX_TARGET_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = None
 
 
 @register_model('transformer_lm')

Mutant 260

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -24,7 +24,6 @@
 @register_model('transformer_lm')
 class TransformerLanguageModel(FairseqLanguageModel):
 
-    @classmethod
     def hub_models(cls):
 
         def moses_fastbpe(path):

Mutant 261

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -45,7 +45,6 @@
     def __init__(self, decoder):
         super().__init__(decoder)
 
-    @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
         # fmt: off

Mutant 262

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -49,7 +49,7 @@
     def add_args(parser):
         """Add model-specific arguments to the parser."""
         # fmt: off
-        parser.add_argument('--activation-fn',
+        parser.add_argument('XX--activation-fnXX',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',

Mutant 263

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -51,7 +51,7 @@
         # fmt: off
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
-                            help='activation function to use')
+                            help='XXactivation function to useXX')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',

Mutant 264

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -52,7 +52,7 @@
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
-        parser.add_argument('--dropout', type=float, metavar='D',
+        parser.add_argument('XX--dropoutXX', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')

Mutant 265

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -52,7 +52,7 @@
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
-        parser.add_argument('--dropout', type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='XXDXX',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')

Mutant 266

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -53,7 +53,7 @@
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',
-                            help='dropout probability')
+                            help='XXdropout probabilityXX')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',

Mutant 267

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -54,7 +54,7 @@
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
-        parser.add_argument('--attention-dropout', type=float, metavar='D',
+        parser.add_argument('XX--attention-dropoutXX', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')

Mutant 268

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -54,7 +54,7 @@
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
-        parser.add_argument('--attention-dropout', type=float, metavar='D',
+        parser.add_argument('--attention-dropout', type=float, metavar='XXDXX',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')

Mutant 269

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -55,7 +55,7 @@
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
-                            help='dropout probability for attention weights')
+                            help='XXdropout probability for attention weightsXX')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',

Mutant 272

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -56,7 +56,7 @@
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
-        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
+        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='XXDXX',
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')

Mutant 273

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -57,7 +57,7 @@
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
-                            help='dropout probability after activation in FFN.')
+                            help='XXdropout probability after activation in FFN.XX')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',

Mutant 274

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -58,7 +58,7 @@
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-embed-dimXX', type=int, metavar='N',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')

Mutant 275

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -58,7 +58,7 @@
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='XXNXX',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')

Mutant 276

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -59,7 +59,7 @@
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension')
+                            help='XXdecoder embedding dimensionXX')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',

Mutant 277

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -60,7 +60,7 @@
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-output-dimXX', type=int, metavar='N',
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')

Mutant 278

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -60,7 +60,7 @@
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-output-dim', type=int, metavar='XXNXX',
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')

Mutant 279

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -61,7 +61,7 @@
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
-                            help='decoder output dimension')
+                            help='XXdecoder output dimensionXX')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',

Mutant 280

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -62,7 +62,7 @@
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')
-        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-input-dimXX', type=int, metavar='N',
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')

Mutant 281

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -62,7 +62,7 @@
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')
-        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-input-dim', type=int, metavar='XXNXX',
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')

Mutant 282

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -63,7 +63,7 @@
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
-                            help='decoder input dimension')
+                            help='XXdecoder input dimensionXX')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',

Mutant 283

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -64,7 +64,7 @@
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-ffn-embed-dimXX', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')

Mutant 284

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -64,7 +64,7 @@
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='XXNXX',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')

Mutant 285

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -65,7 +65,7 @@
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension for FFN')
+                            help='XXdecoder embedding dimension for FFNXX')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',

Mutant 286

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -66,7 +66,7 @@
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
+        parser.add_argument('XX--decoder-layersXX', type=int, metavar='N',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')

Mutant 287

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -66,7 +66,7 @@
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
+        parser.add_argument('--decoder-layers', type=int, metavar='XXNXX',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')

Mutant 288

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -67,7 +67,7 @@
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
-                            help='num decoder layers')
+                            help='XXnum decoder layersXX')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',

Mutant 289

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -68,7 +68,7 @@
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+        parser.add_argument('XX--decoder-attention-headsXX', type=int, metavar='N',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')

Mutant 290

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -68,7 +68,7 @@
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+        parser.add_argument('--decoder-attention-heads', type=int, metavar='XXNXX',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')

Mutant 291

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -69,7 +69,7 @@
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
-                            help='num decoder attention heads')
+                            help='XXnum decoder attention headsXX')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',

Mutant 292

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -70,7 +70,7 @@
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
-        parser.add_argument('--decoder-normalize-before', action='store_true',
+        parser.add_argument('XX--decoder-normalize-beforeXX', action='store_true',
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')

Mutant 294

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -71,7 +71,7 @@
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',
-                            help='apply layernorm before each decoder block')
+                            help='XXapply layernorm before each decoder blockXX')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',

Mutant 295

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -72,7 +72,7 @@
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
-        parser.add_argument('--no-decoder-final-norm', action='store_true',
+        parser.add_argument('XX--no-decoder-final-normXX', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '

Mutant 297

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -73,7 +73,7 @@
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
-                            help='don\'t add an extra layernorm after the last decoder block')
+                            help='XXdon\'t add an extra layernorm after the last decoder blockXX')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')

Mutant 298

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -74,7 +74,7 @@
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+        parser.add_argument('XX--adaptive-softmax-cutoffXX', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',

Mutant 299

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -74,7 +74,7 @@
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='XXEXPRXX',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',

Mutant 300

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -75,7 +75,7 @@
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive softmax cutoff points. '
+                            help='XXcomma separated list of adaptive softmax cutoff points. XX'
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')

Mutant 301

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -77,7 +77,7 @@
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+        parser.add_argument('XX--adaptive-softmax-dropoutXX', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')

Mutant 302

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -77,7 +77,7 @@
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='XXDXX',
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')

Mutant 303

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -78,7 +78,7 @@
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
-                            help='sets adaptive softmax dropout for the tail projections')
+                            help='XXsets adaptive softmax dropout for the tail projectionsXX')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',

Mutant 304

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -79,7 +79,7 @@
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
+        parser.add_argument('XX--adaptive-softmax-factorXX', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')

Mutant 305

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -79,7 +79,7 @@
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
+        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='XXNXX',
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')

Mutant 306

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -80,7 +80,7 @@
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
-                            help='adaptive input factor')
+                            help='XXadaptive input factorXX')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',

Mutant 307

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -81,7 +81,7 @@
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')
-        parser.add_argument('--no-token-positional-embeddings', action='store_true',
+        parser.add_argument('XX--no-token-positional-embeddingsXX', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')

Mutant 309

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -82,7 +82,7 @@
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
-                            help='if set, disables positional embeddings (outside self attention)')
+                            help='XXif set, disables positional embeddings (outside self attention)XX')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',

Mutant 310

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -83,7 +83,7 @@
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
-        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
+        parser.add_argument('XX--share-decoder-input-output-embedXX', action='store_true',
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')

Mutant 312

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -84,7 +84,7 @@
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
-                            help='share decoder input and output embeddings')
+                            help='XXshare decoder input and output embeddingsXX')
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
         parser.add_argument('--character-filters', type=str, metavar='LIST',

Mutant 313

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -85,7 +85,7 @@
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
-        parser.add_argument('--character-embeddings', action='store_true',
+        parser.add_argument('XX--character-embeddingsXX', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',

Mutant 315

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -86,7 +86,7 @@
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',
-                            help='if set, uses character embedding convolutions to produce token embeddings')
+                            help='XXif set, uses character embedding convolutions to produce token embeddingsXX')
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')

Mutant 316

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -87,7 +87,7 @@
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
-        parser.add_argument('--character-filters', type=str, metavar='LIST',
+        parser.add_argument('XX--character-filtersXX', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',

Mutant 317

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -87,7 +87,7 @@
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
-        parser.add_argument('--character-filters', type=str, metavar='LIST',
+        parser.add_argument('--character-filters', type=str, metavar='XXLISTXX',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',

Mutant 318

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -88,7 +88,7 @@
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
         parser.add_argument('--character-filters', type=str, metavar='LIST',
-                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
+                            default='XX[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]XX',
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')

Mutant 319

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -89,7 +89,7 @@
                             help='if set, uses character embedding convolutions to produce token embeddings')
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
-                            help='size of character embeddings')
+                            help='XXsize of character embeddingsXX')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',

Mutant 320

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -90,7 +90,7 @@
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
+        parser.add_argument('XX--character-embedding-dimXX', default=4, type=int, metavar='N',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')

Mutant 321

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -90,7 +90,7 @@
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
+        parser.add_argument('--character-embedding-dim', default=5, type=int, metavar='N',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')

Mutant 322

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -90,7 +90,7 @@
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
+        parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='XXNXX',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')

Mutant 323

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -91,7 +91,7 @@
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
-                            help='size of character embeddings')
+                            help='XXsize of character embeddingsXX')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',

Mutant 324

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -92,7 +92,7 @@
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
+        parser.add_argument('XX--char-embedder-highway-layersXX', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')

Mutant 325

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -92,7 +92,7 @@
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
+        parser.add_argument('--char-embedder-highway-layers', default=3, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')

Mutant 326

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -92,7 +92,7 @@
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
+        parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='XXNXX',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')

Mutant 327

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -93,7 +93,7 @@
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
-                            help='number of highway layers for character token embeddder')
+                            help='XXnumber of highway layers for character token embeddderXX')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',

Mutant 328

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -94,7 +94,7 @@
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
-        parser.add_argument('--adaptive-input', action='store_true',
+        parser.add_argument('XX--adaptive-inputXX', action='store_true',
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')

Mutant 330

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -95,7 +95,7 @@
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
-                            help='if set, uses adaptive input')
+                            help='XXif set, uses adaptive inputXX')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',

Mutant 331

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -96,7 +96,7 @@
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')
-        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
+        parser.add_argument('XX--adaptive-input-factorXX', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')

Mutant 332

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -96,7 +96,7 @@
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')
-        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
+        parser.add_argument('--adaptive-input-factor', type=float, metavar='XXNXX',
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')

Mutant 333

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -97,7 +97,7 @@
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
-                            help='adaptive input factor')
+                            help='XXadaptive input factorXX')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',

Mutant 334

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -98,7 +98,7 @@
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')
-        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
+        parser.add_argument('XX--adaptive-input-cutoffXX', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')

Mutant 335

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -98,7 +98,7 @@
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')
-        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
+        parser.add_argument('--adaptive-input-cutoff', metavar='XXEXPRXX',
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')

Mutant 336

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -99,7 +99,7 @@
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive input cutoff points.')
+                            help='XXcomma separated list of adaptive input cutoff points.XX')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')
         parser.add_argument('--tie-adaptive-proj', action='store_true',

Mutant 337

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -100,7 +100,7 @@
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')
-        parser.add_argument('--tie-adaptive-weights', action='store_true',
+        parser.add_argument('XX--tie-adaptive-weightsXX', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')
         parser.add_argument('--tie-adaptive-proj', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')

Mutant 339

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -101,7 +101,7 @@
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
-                            help='if set, ties the weights of adaptive softmax and adaptive input')
+                            help='XXif set, ties the weights of adaptive softmax and adaptive inputXX')
         parser.add_argument('--tie-adaptive-proj', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
         parser.add_argument('--decoder-learned-pos', action='store_true',

Mutant 340

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -102,7 +102,7 @@
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')
-        parser.add_argument('--tie-adaptive-proj', action='store_true',
+        parser.add_argument('XX--tie-adaptive-projXX', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')

Mutant 342

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -103,7 +103,7 @@
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')
         parser.add_argument('--tie-adaptive-proj', action='store_true',
-                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
+                            help='XXif set, ties the projection weights of adaptive softmax and adaptive inputXX')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--layernorm-embedding', action='store_true',

Mutant 343

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -104,7 +104,7 @@
                             help='if set, ties the weights of adaptive softmax and adaptive input')
         parser.add_argument('--tie-adaptive-proj', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
-        parser.add_argument('--decoder-learned-pos', action='store_true',
+        parser.add_argument('XX--decoder-learned-posXX', action='store_true',
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')

Mutant 345

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -105,7 +105,7 @@
         parser.add_argument('--tie-adaptive-proj', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
         parser.add_argument('--decoder-learned-pos', action='store_true',
-                            help='use learned positional embeddings in the decoder')
+                            help='XXuse learned positional embeddings in the decoderXX')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')
         parser.add_argument('--no-scale-embedding', action='store_true',

Mutant 346

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -106,7 +106,7 @@
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
-        parser.add_argument('--layernorm-embedding', action='store_true',
+        parser.add_argument('XX--layernorm-embeddingXX', action='store_true',
                             help='add layernorm to embedding')
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')

Mutant 348

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -107,7 +107,7 @@
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--layernorm-embedding', action='store_true',
-                            help='add layernorm to embedding')
+                            help='XXadd layernorm to embeddingXX')
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)

Mutant 349

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -108,7 +108,7 @@
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')
-        parser.add_argument('--no-scale-embedding', action='store_true',
+        parser.add_argument('XX--no-scale-embeddingXX', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,

Mutant 351

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -109,7 +109,7 @@
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')
         parser.add_argument('--no-scale-embedding', action='store_true',
-                            help='if True, dont scale embeddings')
+                            help='XXif True, dont scale embeddingsXX')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')

Mutant 352

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -111,7 +111,7 @@
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('XX--decoder-layerdropXX', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 353

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -111,7 +111,7 @@
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--decoder-layerdrop', type=float, metavar='XXDXX', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 354

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -111,7 +111,7 @@
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=1,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 355

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -112,7 +112,7 @@
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
-                            help='LayerDrop probability for decoder')
+                            help='XXLayerDrop probability for decoderXX')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)

Mutant 356

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -113,7 +113,7 @@
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
-        parser.add_argument('--decoder-layers-to-keep', default=None,
+        parser.add_argument('XX--decoder-layers-to-keepXX', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,

Mutant 357

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -114,7 +114,7 @@
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--decoder-layers-to-keep', default=None,
-                            help='which layers to *keep* when pruning as a comma-separated list')
+                            help='XXwhich layers to *keep* when pruning as a comma-separated listXX')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')

Mutant 358

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -116,7 +116,7 @@
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('XX--quant-noise-pqXX', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 359

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -116,7 +116,7 @@
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-pq', type=float, metavar='XXDXX', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 360

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -116,7 +116,7 @@
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=1,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 361

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -117,7 +117,7 @@
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
-                            help='iterative PQ quantization noise at training time')
+                            help='XXiterative PQ quantization noise at training timeXX')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,

Mutant 362

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -118,7 +118,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('XX--quant-noise-pq-block-sizeXX', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 363

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -118,7 +118,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='XXDXX', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 364

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -118,7 +118,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=9,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 365

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -119,7 +119,7 @@
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
-                            help='block size of quantization noise at training time')
+                            help='XXblock size of quantization noise at training timeXX')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on

Mutant 366

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -120,7 +120,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('XX--quant-noise-scalarXX', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on
 

Mutant 367

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -120,7 +120,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='XXDXX', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on
 

Mutant 368

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -120,7 +120,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=1,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on
 

Mutant 369

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -121,7 +121,7 @@
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
-                            help='scalar quantization noise and scalar quantization at training time')
+                            help='XXscalar quantization noise and scalar quantization at training timeXX')
         # fmt: on
 
     @classmethod

Mutant 370

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -122,9 +122,7 @@
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
-        # fmt: on
-
-    @classmethod
+
     def build_model(cls, args, task):
         """Build a new model instance."""
 

Mutant 371

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -165,7 +165,6 @@
         )
         return cls(decoder)
 
-    @classmethod
     def build_embedding(cls, args, dictionary, embed_dim, path=None):
         embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad())
         return embed_tokens

Mutant 373

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -171,7 +171,7 @@
         return embed_tokens
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm')
+@register_model_architecture('transformer_lm', 'XXtransformer_lmXX')
 def base_lm_architecture(args):
     # backward compatibility for older model checkpoints
     if hasattr(args, 'no_tie_adaptive_proj'):

Mutant 374

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -170,8 +170,6 @@
         embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad())
         return embed_tokens
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm')
 def base_lm_architecture(args):
     # backward compatibility for older model checkpoints
     if hasattr(args, 'no_tie_adaptive_proj'):

Mutant 376

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -219,7 +219,7 @@
     args.layernorm_embedding = getattr(args, 'layernorm_embedding', False)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_big')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_bigXX')
 def transformer_lm_big(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 12)
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)

Mutant 377

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -218,8 +218,6 @@
     args.no_scale_embedding = getattr(args, 'no_scale_embedding', False)
     args.layernorm_embedding = getattr(args, 'layernorm_embedding', False)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_big')
 def transformer_lm_big(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 12)
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)

Mutant 379

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -228,7 +228,7 @@
     base_lm_architecture(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_wiki103XX')
 @register_model_architecture('transformer_lm', 'transformer_lm_baevski_wiki103')
 def transformer_lm_baevski_wiki103(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 16)

Mutant 380

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -227,8 +227,6 @@
     args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
     base_lm_architecture(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
 @register_model_architecture('transformer_lm', 'transformer_lm_baevski_wiki103')
 def transformer_lm_baevski_wiki103(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 16)

Mutant 382

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -229,7 +229,7 @@
 
 
 @register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
-@register_model_architecture('transformer_lm', 'transformer_lm_baevski_wiki103')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_baevski_wiki103XX')
 def transformer_lm_baevski_wiki103(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 16)
     args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)

Mutant 383

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -229,7 +229,7 @@
 
 
 @register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
-@register_model_architecture('transformer_lm', 'transformer_lm_baevski_wiki103')
+
 def transformer_lm_baevski_wiki103(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 16)
     args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)

Mutant 385

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -246,7 +246,7 @@
     transformer_lm_big(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gbwXX')
 @register_model_architecture('transformer_lm', 'transformer_lm_baevski_gbw')
 def transformer_lm_baevski_gbw(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)

Mutant 386

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -245,8 +245,6 @@
     args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', True)
     transformer_lm_big(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
 @register_model_architecture('transformer_lm', 'transformer_lm_baevski_gbw')
 def transformer_lm_baevski_gbw(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)

Mutant 388

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -247,7 +247,7 @@
 
 
 @register_model_architecture('transformer_lm', 'transformer_lm_gbw')
-@register_model_architecture('transformer_lm', 'transformer_lm_baevski_gbw')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_baevski_gbwXX')
 def transformer_lm_baevski_gbw(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
     args.dropout = getattr(args, 'dropout', 0.1)

Mutant 389

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -247,7 +247,7 @@
 
 
 @register_model_architecture('transformer_lm', 'transformer_lm_gbw')
-@register_model_architecture('transformer_lm', 'transformer_lm_baevski_gbw')
+
 def transformer_lm_baevski_gbw(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
     args.dropout = getattr(args, 'dropout', 0.1)

Mutant 391

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -256,7 +256,7 @@
     transformer_lm_big(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gptXX')
 def transformer_lm_gpt(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072)

Mutant 392

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -255,8 +255,6 @@
     args.no_decoder_final_norm = getattr(args, 'no_decoder_final_norm', True)
     transformer_lm_big(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt')
 def transformer_lm_gpt(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072)

Mutant 394

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -268,7 +268,7 @@
     base_lm_architecture(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_small')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gpt2_smallXX')
 def transformer_lm_gpt2_small(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)

Mutant 395

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -267,8 +267,6 @@
     args.activation_fn = getattr(args, 'activation_fn', 'gelu')
     base_lm_architecture(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_small')
 def transformer_lm_gpt2_small(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)

Mutant 397

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -280,7 +280,7 @@
     base_lm_architecture(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_medium')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gpt2_mediumXX')
 def transformer_lm_gpt2_medium(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1280)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 5120)

Mutant 398

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -279,8 +279,6 @@
     args.activation_fn = getattr(args, 'activation_fn', 'gelu')
     base_lm_architecture(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_medium')
 def transformer_lm_gpt2_medium(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1280)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 5120)

Mutant 400

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -292,7 +292,7 @@
     base_lm_architecture(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_big')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gpt2_bigXX')
 def transformer_lm_gpt2_big(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1600)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 6400)

Mutant 401

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -291,8 +291,6 @@
     args.activation_fn = getattr(args, 'activation_fn', 'gelu')
     base_lm_architecture(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_big')
 def transformer_lm_gpt2_big(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1600)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 6400)