fairseq/models/transformer_lm.py

Killed 25 out of 146 mutants

Survived

These mutants survived mutation testing: the test suite still passed with each change below applied, so each one marks a hole in your test suite.

Mutant 1510

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -18,7 +18,7 @@
     CharacterTokenEmbedder,
 )
 
-DEFAULT_MAX_TARGET_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = 1025
 
 
 @register_model('transformer_lm')

Mutant 1511

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -18,7 +18,7 @@
     CharacterTokenEmbedder,
 )
 
-DEFAULT_MAX_TARGET_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = None
 
 
 @register_model('transformer_lm')

Mutant 1514

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -24,7 +24,6 @@
 @register_model('transformer_lm')
 class TransformerLanguageModel(FairseqLanguageModel):
 
-    @classmethod
     def hub_models(cls):
 
         def moses_fastbpe(path):

Mutant 1515

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -45,7 +45,6 @@
     def __init__(self, decoder):
         super().__init__(decoder)
 
-    @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
         # fmt: off

Mutant 1516

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -49,7 +49,7 @@
     def add_args(parser):
         """Add model-specific arguments to the parser."""
         # fmt: off
-        parser.add_argument('--activation-fn',
+        parser.add_argument('XX--activation-fnXX',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',

Mutant 1517

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -51,7 +51,7 @@
         # fmt: off
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
-                            help='activation function to use')
+                            help='XXactivation function to useXX')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',

Mutant 1518

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -52,7 +52,7 @@
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
-        parser.add_argument('--dropout', type=float, metavar='D',
+        parser.add_argument('XX--dropoutXX', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')

Mutant 1519

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -52,7 +52,7 @@
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
-        parser.add_argument('--dropout', type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='XXDXX',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')

Mutant 1520

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -53,7 +53,7 @@
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',
-                            help='dropout probability')
+                            help='XXdropout probabilityXX')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',

Mutant 1521

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -54,7 +54,7 @@
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
-        parser.add_argument('--attention-dropout', type=float, metavar='D',
+        parser.add_argument('XX--attention-dropoutXX', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')

Mutant 1522

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -54,7 +54,7 @@
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
-        parser.add_argument('--attention-dropout', type=float, metavar='D',
+        parser.add_argument('--attention-dropout', type=float, metavar='XXDXX',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')

Mutant 1523

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -55,7 +55,7 @@
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
-                            help='dropout probability for attention weights')
+                            help='XXdropout probability for attention weightsXX')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',

Mutant 1526

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -56,7 +56,7 @@
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
-        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
+        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='XXDXX',
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')

Mutant 1527

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -57,7 +57,7 @@
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
-                            help='dropout probability after activation in FFN.')
+                            help='XXdropout probability after activation in FFN.XX')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',

Mutant 1528

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -58,7 +58,7 @@
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-embed-dimXX', type=int, metavar='N',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')

Mutant 1529

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -58,7 +58,7 @@
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='XXNXX',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')

Mutant 1530

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -59,7 +59,7 @@
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension')
+                            help='XXdecoder embedding dimensionXX')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',

Mutant 1531

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -60,7 +60,7 @@
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-output-dimXX', type=int, metavar='N',
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')

Mutant 1532

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -60,7 +60,7 @@
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-output-dim', type=int, metavar='XXNXX',
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')

Mutant 1533

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -61,7 +61,7 @@
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
-                            help='decoder output dimension')
+                            help='XXdecoder output dimensionXX')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',

Mutant 1534

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -62,7 +62,7 @@
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')
-        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-input-dimXX', type=int, metavar='N',
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')

Mutant 1535

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -62,7 +62,7 @@
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')
-        parser.add_argument('--decoder-input-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-input-dim', type=int, metavar='XXNXX',
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')

Mutant 1536

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -63,7 +63,7 @@
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
-                            help='decoder input dimension')
+                            help='XXdecoder input dimensionXX')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',

Mutant 1537

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -64,7 +64,7 @@
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-ffn-embed-dimXX', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')

Mutant 1538

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -64,7 +64,7 @@
                             help='decoder output dimension')
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='XXNXX',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')

Mutant 1539

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -65,7 +65,7 @@
         parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension for FFN')
+                            help='XXdecoder embedding dimension for FFNXX')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',

Mutant 1540

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -66,7 +66,7 @@
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
+        parser.add_argument('XX--decoder-layersXX', type=int, metavar='N',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')

Mutant 1541

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -66,7 +66,7 @@
                             help='decoder input dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
+        parser.add_argument('--decoder-layers', type=int, metavar='XXNXX',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')

Mutant 1542

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -67,7 +67,7 @@
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
-                            help='num decoder layers')
+                            help='XXnum decoder layersXX')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',

Mutant 1543

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -68,7 +68,7 @@
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+        parser.add_argument('XX--decoder-attention-headsXX', type=int, metavar='N',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')

Mutant 1544

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -68,7 +68,7 @@
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+        parser.add_argument('--decoder-attention-heads', type=int, metavar='XXNXX',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')

Mutant 1545

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -69,7 +69,7 @@
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
-                            help='num decoder attention heads')
+                            help='XXnum decoder attention headsXX')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',

Mutant 1546

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -70,7 +70,7 @@
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
-        parser.add_argument('--decoder-normalize-before', action='store_true',
+        parser.add_argument('XX--decoder-normalize-beforeXX', action='store_true',
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')

Mutant 1548

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -71,7 +71,7 @@
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',
-                            help='apply layernorm before each decoder block')
+                            help='XXapply layernorm before each decoder blockXX')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',

Mutant 1549

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -72,7 +72,7 @@
                             help='num decoder attention heads')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
-        parser.add_argument('--no-decoder-final-norm', action='store_true',
+        parser.add_argument('XX--no-decoder-final-normXX', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '

Mutant 1551

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -73,7 +73,7 @@
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
-                            help='don\'t add an extra layernorm after the last decoder block')
+                            help='XXdon\'t add an extra layernorm after the last decoder blockXX')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')

Mutant 1552

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -74,7 +74,7 @@
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+        parser.add_argument('XX--adaptive-softmax-cutoffXX', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',

Mutant 1553

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -74,7 +74,7 @@
                             help='apply layernorm before each decoder block')
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='XXEXPRXX',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',

Mutant 1554

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -75,7 +75,7 @@
         parser.add_argument('--no-decoder-final-norm', action='store_true',
                             help='don\'t add an extra layernorm after the last decoder block')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive softmax cutoff points. '
+                            help='XXcomma separated list of adaptive softmax cutoff points. XX'
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')

Mutant 1555

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -77,7 +77,7 @@
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+        parser.add_argument('XX--adaptive-softmax-dropoutXX', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')

Mutant 1556

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -77,7 +77,7 @@
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='XXDXX',
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')

Mutant 1557

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -78,7 +78,7 @@
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
-                            help='sets adaptive softmax dropout for the tail projections')
+                            help='XXsets adaptive softmax dropout for the tail projectionsXX')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',

Mutant 1558

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -79,7 +79,7 @@
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
+        parser.add_argument('XX--adaptive-softmax-factorXX', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')

Mutant 1559

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -79,7 +79,7 @@
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
+        parser.add_argument('--adaptive-softmax-factor', type=float, metavar='XXNXX',
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')

Mutant 1560

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -80,7 +80,7 @@
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
-                            help='adaptive input factor')
+                            help='XXadaptive input factorXX')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',

Mutant 1561

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -81,7 +81,7 @@
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')
-        parser.add_argument('--no-token-positional-embeddings', action='store_true',
+        parser.add_argument('XX--no-token-positional-embeddingsXX', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')

Mutant 1563

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -82,7 +82,7 @@
         parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
-                            help='if set, disables positional embeddings (outside self attention)')
+                            help='XXif set, disables positional embeddings (outside self attention)XX')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',

Mutant 1564

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -83,7 +83,7 @@
                             help='adaptive input factor')
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
-        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
+        parser.add_argument('XX--share-decoder-input-output-embedXX', action='store_true',
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')

Mutant 1566

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -84,7 +84,7 @@
         parser.add_argument('--no-token-positional-embeddings', action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
-                            help='share decoder input and output embeddings')
+                            help='XXshare decoder input and output embeddingsXX')
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
         parser.add_argument('--character-filters', type=str, metavar='LIST',

Mutant 1567

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -85,7 +85,7 @@
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
-        parser.add_argument('--character-embeddings', action='store_true',
+        parser.add_argument('XX--character-embeddingsXX', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',

Mutant 1569

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -86,7 +86,7 @@
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',
-                            help='if set, uses character embedding convolutions to produce token embeddings')
+                            help='XXif set, uses character embedding convolutions to produce token embeddingsXX')
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')

Mutant 1570

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -87,7 +87,7 @@
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
-        parser.add_argument('--character-filters', type=str, metavar='LIST',
+        parser.add_argument('XX--character-filtersXX', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',

Mutant 1571

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -87,7 +87,7 @@
                             help='share decoder input and output embeddings')
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
-        parser.add_argument('--character-filters', type=str, metavar='LIST',
+        parser.add_argument('--character-filters', type=str, metavar='XXLISTXX',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',

Mutant 1572

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -88,7 +88,7 @@
         parser.add_argument('--character-embeddings', action='store_true',
                             help='if set, uses character embedding convolutions to produce token embeddings')
         parser.add_argument('--character-filters', type=str, metavar='LIST',
-                            default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
+                            default='XX[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]XX',
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')

Mutant 1573

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -89,7 +89,7 @@
                             help='if set, uses character embedding convolutions to produce token embeddings')
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
-                            help='size of character embeddings')
+                            help='XXsize of character embeddingsXX')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',

Mutant 1574

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -90,7 +90,7 @@
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
+        parser.add_argument('XX--character-embedding-dimXX', default=4, type=int, metavar='N',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')

Mutant 1575

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -90,7 +90,7 @@
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
+        parser.add_argument('--character-embedding-dim', default=5, type=int, metavar='N',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')

Mutant 1576

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -90,7 +90,7 @@
         parser.add_argument('--character-filters', type=str, metavar='LIST',
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
-        parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
+        parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='XXNXX',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')

Mutant 1577

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -91,7 +91,7 @@
                             default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
-                            help='size of character embeddings')
+                            help='XXsize of character embeddingsXX')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',

Mutant 1578

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -92,7 +92,7 @@
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
+        parser.add_argument('XX--char-embedder-highway-layersXX', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')

Mutant 1579

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -92,7 +92,7 @@
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
+        parser.add_argument('--char-embedder-highway-layers', default=3, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')

Mutant 1580

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -92,7 +92,7 @@
                             help='size of character embeddings')
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
-        parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
+        parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='XXNXX',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')

Mutant 1581

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -93,7 +93,7 @@
         parser.add_argument('--character-embedding-dim', default=4, type=int, metavar='N',
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
-                            help='number of highway layers for character token embeddder')
+                            help='XXnumber of highway layers for character token embeddderXX')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',

Mutant 1582

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -94,7 +94,7 @@
                             help='size of character embeddings')
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
-        parser.add_argument('--adaptive-input', action='store_true',
+        parser.add_argument('XX--adaptive-inputXX', action='store_true',
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')

Mutant 1584

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -95,7 +95,7 @@
         parser.add_argument('--char-embedder-highway-layers', default=2, type=int, metavar='N',
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
-                            help='if set, uses adaptive input')
+                            help='XXif set, uses adaptive inputXX')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',

Mutant 1585

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -96,7 +96,7 @@
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')
-        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
+        parser.add_argument('XX--adaptive-input-factorXX', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')

Mutant 1586

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -96,7 +96,7 @@
                             help='number of highway layers for character token embeddder')
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')
-        parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
+        parser.add_argument('--adaptive-input-factor', type=float, metavar='XXNXX',
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')

Mutant 1587

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -97,7 +97,7 @@
         parser.add_argument('--adaptive-input', action='store_true',
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
-                            help='adaptive input factor')
+                            help='XXadaptive input factorXX')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',

Mutant 1588

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -98,7 +98,7 @@
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')
-        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
+        parser.add_argument('XX--adaptive-input-cutoffXX', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')

Mutant 1589

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -98,7 +98,7 @@
                             help='if set, uses adaptive input')
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')
-        parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
+        parser.add_argument('--adaptive-input-cutoff', metavar='XXEXPRXX',
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')

Mutant 1590

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -99,7 +99,7 @@
         parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive input cutoff points.')
+                            help='XXcomma separated list of adaptive input cutoff points.XX')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')
         parser.add_argument('--tie-adaptive-proj', action='store_true',

Mutant 1591

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -100,7 +100,7 @@
                             help='adaptive input factor')
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')
-        parser.add_argument('--tie-adaptive-weights', action='store_true',
+        parser.add_argument('XX--tie-adaptive-weightsXX', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')
         parser.add_argument('--tie-adaptive-proj', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')

Mutant 1593

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -101,7 +101,7 @@
         parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
-                            help='if set, ties the weights of adaptive softmax and adaptive input')
+                            help='XXif set, ties the weights of adaptive softmax and adaptive inputXX')
         parser.add_argument('--tie-adaptive-proj', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
         parser.add_argument('--decoder-learned-pos', action='store_true',

Mutant 1594

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -102,7 +102,7 @@
                             help='comma separated list of adaptive input cutoff points.')
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')
-        parser.add_argument('--tie-adaptive-proj', action='store_true',
+        parser.add_argument('XX--tie-adaptive-projXX', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')

Mutant 1596

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -103,7 +103,7 @@
         parser.add_argument('--tie-adaptive-weights', action='store_true',
                             help='if set, ties the weights of adaptive softmax and adaptive input')
         parser.add_argument('--tie-adaptive-proj', action='store_true',
-                            help='if set, ties the projection weights of adaptive softmax and adaptive input')
+                            help='XXif set, ties the projection weights of adaptive softmax and adaptive inputXX')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--layernorm-embedding', action='store_true',

Mutant 1597

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -104,7 +104,7 @@
                             help='if set, ties the weights of adaptive softmax and adaptive input')
         parser.add_argument('--tie-adaptive-proj', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
-        parser.add_argument('--decoder-learned-pos', action='store_true',
+        parser.add_argument('XX--decoder-learned-posXX', action='store_true',
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')

Mutant 1599

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -105,7 +105,7 @@
         parser.add_argument('--tie-adaptive-proj', action='store_true',
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
         parser.add_argument('--decoder-learned-pos', action='store_true',
-                            help='use learned positional embeddings in the decoder')
+                            help='XXuse learned positional embeddings in the decoderXX')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')
         parser.add_argument('--no-scale-embedding', action='store_true',

Mutant 1600

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -106,7 +106,7 @@
                             help='if set, ties the projection weights of adaptive softmax and adaptive input')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
-        parser.add_argument('--layernorm-embedding', action='store_true',
+        parser.add_argument('XX--layernorm-embeddingXX', action='store_true',
                             help='add layernorm to embedding')
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')

Mutant 1602

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -107,7 +107,7 @@
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--layernorm-embedding', action='store_true',
-                            help='add layernorm to embedding')
+                            help='XXadd layernorm to embeddingXX')
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)

Mutant 1603

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -108,7 +108,7 @@
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')
-        parser.add_argument('--no-scale-embedding', action='store_true',
+        parser.add_argument('XX--no-scale-embeddingXX', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,

Mutant 1605

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -109,7 +109,7 @@
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')
         parser.add_argument('--no-scale-embedding', action='store_true',
-                            help='if True, dont scale embeddings')
+                            help='XXif True, dont scale embeddingsXX')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')

Mutant 1606

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -111,7 +111,7 @@
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('XX--decoder-layerdropXX', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 1607

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -111,7 +111,7 @@
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--decoder-layerdrop', type=float, metavar='XXDXX', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 1608

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -111,7 +111,7 @@
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=1,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 1609

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -112,7 +112,7 @@
                             help='if True, dont scale embeddings')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
-                            help='LayerDrop probability for decoder')
+                            help='XXLayerDrop probability for decoderXX')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)

Mutant 1610

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -113,7 +113,7 @@
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
-        parser.add_argument('--decoder-layers-to-keep', default=None,
+        parser.add_argument('XX--decoder-layers-to-keepXX', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,

Mutant 1611

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -114,7 +114,7 @@
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--decoder-layers-to-keep', default=None,
-                            help='which layers to *keep* when pruning as a comma-separated list')
+                            help='XXwhich layers to *keep* when pruning as a comma-separated listXX')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')

Mutant 1612

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -116,7 +116,7 @@
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('XX--quant-noise-pqXX', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 1613

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -116,7 +116,7 @@
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-pq', type=float, metavar='XXDXX', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 1614

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -116,7 +116,7 @@
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=1,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 1615

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -117,7 +117,7 @@
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
-                            help='iterative PQ quantization noise at training time')
+                            help='XXiterative PQ quantization noise at training timeXX')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,

Mutant 1616

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -118,7 +118,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('XX--quant-noise-pq-block-sizeXX', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 1617

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -118,7 +118,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='XXDXX', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 1618

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -118,7 +118,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=9,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 1619

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -119,7 +119,7 @@
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
-                            help='block size of quantization noise at training time')
+                            help='XXblock size of quantization noise at training timeXX')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on

Mutant 1620

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -120,7 +120,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('XX--quant-noise-scalarXX', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on
 

Mutant 1621

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -120,7 +120,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='XXDXX', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on
 

Mutant 1622

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -120,7 +120,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=1,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on
 

Mutant 1623

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -121,7 +121,7 @@
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
-                            help='scalar quantization noise and scalar quantization at training time')
+                            help='XXscalar quantization noise and scalar quantization at training timeXX')
         # fmt: on
 
     @classmethod

Mutant 1624

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -122,9 +122,7 @@
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
-        # fmt: on
-
-    @classmethod
+
     def build_model(cls, args, task):
         """Build a new model instance."""
 

Mutant 1625

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -165,7 +165,6 @@
         )
         return cls(decoder)
 
-    @classmethod
     def build_embedding(cls, args, dictionary, embed_dim, path=None):
         embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad())
         return embed_tokens

Mutant 1627

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -171,7 +171,7 @@
         return embed_tokens
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm')
+@register_model_architecture('transformer_lm', 'XXtransformer_lmXX')
 def base_lm_architecture(args):
     # backward compatibility for older model checkpoints
     if hasattr(args, 'no_tie_adaptive_proj'):

Mutant 1628

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -170,8 +170,6 @@
         embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad())
         return embed_tokens
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm')
 def base_lm_architecture(args):
     # backward compatibility for older model checkpoints
     if hasattr(args, 'no_tie_adaptive_proj'):

Mutant 1630

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -219,7 +219,7 @@
     args.layernorm_embedding = getattr(args, 'layernorm_embedding', False)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_big')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_bigXX')
 def transformer_lm_big(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 12)
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)

Mutant 1631

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -218,8 +218,6 @@
     args.no_scale_embedding = getattr(args, 'no_scale_embedding', False)
     args.layernorm_embedding = getattr(args, 'layernorm_embedding', False)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_big')
 def transformer_lm_big(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 12)
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)

Mutant 1633

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -228,7 +228,7 @@
     base_lm_architecture(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_wiki103XX')
 @register_model_architecture('transformer_lm', 'transformer_lm_baevski_wiki103')
 def transformer_lm_baevski_wiki103(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 16)

Mutant 1634

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -227,8 +227,6 @@
     args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
     base_lm_architecture(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
 @register_model_architecture('transformer_lm', 'transformer_lm_baevski_wiki103')
 def transformer_lm_baevski_wiki103(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 16)

Mutant 1636

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -229,7 +229,7 @@
 
 
 @register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
-@register_model_architecture('transformer_lm', 'transformer_lm_baevski_wiki103')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_baevski_wiki103XX')
 def transformer_lm_baevski_wiki103(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 16)
     args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)

Mutant 1637

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -229,7 +229,7 @@
 
 
 @register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
-@register_model_architecture('transformer_lm', 'transformer_lm_baevski_wiki103')
+
 def transformer_lm_baevski_wiki103(args):
     args.decoder_layers = getattr(args, 'decoder_layers', 16)
     args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)

Mutant 1639

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -246,7 +246,7 @@
     transformer_lm_big(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gbwXX')
 @register_model_architecture('transformer_lm', 'transformer_lm_baevski_gbw')
 def transformer_lm_baevski_gbw(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)

Mutant 1640

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -245,8 +245,6 @@
     args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', True)
     transformer_lm_big(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
 @register_model_architecture('transformer_lm', 'transformer_lm_baevski_gbw')
 def transformer_lm_baevski_gbw(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)

Mutant 1642

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -247,7 +247,7 @@
 
 
 @register_model_architecture('transformer_lm', 'transformer_lm_gbw')
-@register_model_architecture('transformer_lm', 'transformer_lm_baevski_gbw')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_baevski_gbwXX')
 def transformer_lm_baevski_gbw(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
     args.dropout = getattr(args, 'dropout', 0.1)

Mutant 1643

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -247,7 +247,7 @@
 
 
 @register_model_architecture('transformer_lm', 'transformer_lm_gbw')
-@register_model_architecture('transformer_lm', 'transformer_lm_baevski_gbw')
+
 def transformer_lm_baevski_gbw(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
     args.dropout = getattr(args, 'dropout', 0.1)

Mutant 1645

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -256,7 +256,7 @@
     transformer_lm_big(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gptXX')
 def transformer_lm_gpt(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072)

Mutant 1646

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -255,8 +255,6 @@
     args.no_decoder_final_norm = getattr(args, 'no_decoder_final_norm', True)
     transformer_lm_big(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt')
 def transformer_lm_gpt(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 3072)

Mutant 1648

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -268,7 +268,7 @@
     base_lm_architecture(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_small')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gpt2_smallXX')
 def transformer_lm_gpt2_small(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)

Mutant 1649

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -267,8 +267,6 @@
     args.activation_fn = getattr(args, 'activation_fn', 'gelu')
     base_lm_architecture(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_small')
 def transformer_lm_gpt2_small(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)

Mutant 1651

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -280,7 +280,7 @@
     base_lm_architecture(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_medium')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gpt2_mediumXX')
 def transformer_lm_gpt2_medium(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1280)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 5120)

Mutant 1652

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -279,8 +279,6 @@
     args.activation_fn = getattr(args, 'activation_fn', 'gelu')
     base_lm_architecture(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_medium')
 def transformer_lm_gpt2_medium(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1280)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 5120)

Mutant 1654

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -292,7 +292,7 @@
     base_lm_architecture(args)
 
 
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_big')
+@register_model_architecture('transformer_lm', 'XXtransformer_lm_gpt2_bigXX')
 def transformer_lm_gpt2_big(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1600)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 6400)

Mutant 1655

--- fairseq/models/transformer_lm.py
+++ fairseq/models/transformer_lm.py
@@ -291,8 +291,6 @@
     args.activation_fn = getattr(args, 'activation_fn', 'gelu')
     base_lm_architecture(args)
 
-
-@register_model_architecture('transformer_lm', 'transformer_lm_gpt2_big')
 def transformer_lm_gpt2_big(args):
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1600)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 6400)