fairseq/models/transformer.py

Killed 22 out of 151 mutants

Survived

The mutants below survived mutation testing: no test failed when the change was applied. Each one points to a gap in your test suite.

Mutant 2597

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -31,7 +31,7 @@
 from torch import Tensor
 
 
-DEFAULT_MAX_SOURCE_POSITIONS = 1024
+DEFAULT_MAX_SOURCE_POSITIONS = 1025
 DEFAULT_MAX_TARGET_POSITIONS = 1024
 
 

Mutant 2598

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -31,7 +31,7 @@
 from torch import Tensor
 
 
-DEFAULT_MAX_SOURCE_POSITIONS = 1024
+DEFAULT_MAX_SOURCE_POSITIONS = None
 DEFAULT_MAX_TARGET_POSITIONS = 1024
 
 

Mutant 2599

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -32,7 +32,7 @@
 
 
 DEFAULT_MAX_SOURCE_POSITIONS = 1024
-DEFAULT_MAX_TARGET_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = 1025
 
 
 @register_model("transformer")

Mutant 2600

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -32,7 +32,7 @@
 
 
 DEFAULT_MAX_SOURCE_POSITIONS = 1024
-DEFAULT_MAX_TARGET_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = None
 
 
 @register_model("transformer")

Mutant 2603

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -53,7 +53,6 @@
         :prog:
     """
 
-    @classmethod
     def hub_models(cls):
         # fmt: off
 

Mutant 2604

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -91,7 +91,6 @@
         self.args = args
         self.supports_align_args = True
 
-    @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
         # fmt: off

Mutant 2605

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -95,7 +95,7 @@
     def add_args(parser):
         """Add model-specific arguments to the parser."""
         # fmt: off
-        parser.add_argument('--activation-fn',
+        parser.add_argument('XX--activation-fnXX',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',

Mutant 2606

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -97,7 +97,7 @@
         # fmt: off
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
-                            help='activation function to use')
+                            help='XXactivation function to useXX')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',

Mutant 2607

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -98,7 +98,7 @@
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
-        parser.add_argument('--dropout', type=float, metavar='D',
+        parser.add_argument('XX--dropoutXX', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')

Mutant 2608

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -98,7 +98,7 @@
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
-        parser.add_argument('--dropout', type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='XXDXX',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')

Mutant 2609

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -99,7 +99,7 @@
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',
-                            help='dropout probability')
+                            help='XXdropout probabilityXX')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',

Mutant 2610

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -100,7 +100,7 @@
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
-        parser.add_argument('--attention-dropout', type=float, metavar='D',
+        parser.add_argument('XX--attention-dropoutXX', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')

Mutant 2611

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -100,7 +100,7 @@
                             help='activation function to use')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
-        parser.add_argument('--attention-dropout', type=float, metavar='D',
+        parser.add_argument('--attention-dropout', type=float, metavar='XXDXX',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')

Mutant 2612

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -101,7 +101,7 @@
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
-                            help='dropout probability for attention weights')
+                            help='XXdropout probability for attention weightsXX')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--encoder-embed-path', type=str, metavar='STR',

Mutant 2615

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -102,7 +102,7 @@
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
-        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
+        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='XXDXX',
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')

Mutant 2616

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -103,7 +103,7 @@
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
-                            help='dropout probability after activation in FFN.')
+                            help='XXdropout probability after activation in FFN.XX')
         parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',

Mutant 2617

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -104,7 +104,7 @@
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
-        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+        parser.add_argument('XX--encoder-embed-pathXX', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')

Mutant 2618

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -104,7 +104,7 @@
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
-        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+        parser.add_argument('--encoder-embed-path', type=str, metavar='XXSTRXX',
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')

Mutant 2619

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -105,7 +105,7 @@
         parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
-                            help='path to pre-trained encoder embedding')
+                            help='XXpath to pre-trained encoder embeddingXX')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',

Mutant 2620

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -106,7 +106,7 @@
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
-        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+        parser.add_argument('XX--encoder-embed-dimXX', type=int, metavar='N',
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension for FFN')

Mutant 2621

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -106,7 +106,7 @@
                             help='dropout probability after activation in FFN.')
         parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
-        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='XXNXX',
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension for FFN')

Mutant 2622

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -107,7 +107,7 @@
         parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
-                            help='encoder embedding dimension')
+                            help='XXencoder embedding dimensionXX')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-layers', type=int, metavar='N',

Mutant 2623

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -108,7 +108,7 @@
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
-        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
+        parser.add_argument('XX--encoder-ffn-embed-dimXX', type=int, metavar='N',
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-layers', type=int, metavar='N',
                             help='num encoder layers')

Mutant 2624

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -108,7 +108,7 @@
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
-        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
+        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='XXNXX',
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-layers', type=int, metavar='N',
                             help='num encoder layers')

Mutant 2625

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -109,7 +109,7 @@
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
-                            help='encoder embedding dimension for FFN')
+                            help='XXencoder embedding dimension for FFNXX')
         parser.add_argument('--encoder-layers', type=int, metavar='N',
                             help='num encoder layers')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='N',

Mutant 2626

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -110,7 +110,7 @@
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension for FFN')
-        parser.add_argument('--encoder-layers', type=int, metavar='N',
+        parser.add_argument('XX--encoder-layersXX', type=int, metavar='N',
                             help='num encoder layers')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                             help='num encoder attention heads')

Mutant 2627

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -110,7 +110,7 @@
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension for FFN')
-        parser.add_argument('--encoder-layers', type=int, metavar='N',
+        parser.add_argument('--encoder-layers', type=int, metavar='XXNXX',
                             help='num encoder layers')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                             help='num encoder attention heads')

Mutant 2628

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -111,7 +111,7 @@
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-layers', type=int, metavar='N',
-                            help='num encoder layers')
+                            help='XXnum encoder layersXX')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                             help='num encoder attention heads')
         parser.add_argument('--encoder-normalize-before', action='store_true',

Mutant 2629

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -112,7 +112,7 @@
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-layers', type=int, metavar='N',
                             help='num encoder layers')
-        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
+        parser.add_argument('XX--encoder-attention-headsXX', type=int, metavar='N',
                             help='num encoder attention heads')
         parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')

Mutant 2630

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -112,7 +112,7 @@
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-layers', type=int, metavar='N',
                             help='num encoder layers')
-        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
+        parser.add_argument('--encoder-attention-heads', type=int, metavar='XXNXX',
                             help='num encoder attention heads')
         parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')

Mutant 2631

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -113,7 +113,7 @@
         parser.add_argument('--encoder-layers', type=int, metavar='N',
                             help='num encoder layers')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
-                            help='num encoder attention heads')
+                            help='XXnum encoder attention headsXX')
         parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')
         parser.add_argument('--encoder-learned-pos', action='store_true',

Mutant 2632

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -114,7 +114,7 @@
                             help='num encoder layers')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                             help='num encoder attention heads')
-        parser.add_argument('--encoder-normalize-before', action='store_true',
+        parser.add_argument('XX--encoder-normalize-beforeXX', action='store_true',
                             help='apply layernorm before each encoder block')
         parser.add_argument('--encoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the encoder')

Mutant 2634

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -115,7 +115,7 @@
         parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                             help='num encoder attention heads')
         parser.add_argument('--encoder-normalize-before', action='store_true',
-                            help='apply layernorm before each encoder block')
+                            help='XXapply layernorm before each encoder blockXX')
         parser.add_argument('--encoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the encoder')
         parser.add_argument('--decoder-embed-path', type=str, metavar='STR',

Mutant 2635

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -116,7 +116,7 @@
                             help='num encoder attention heads')
         parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')
-        parser.add_argument('--encoder-learned-pos', action='store_true',
+        parser.add_argument('XX--encoder-learned-posXX', action='store_true',
                             help='use learned positional embeddings in the encoder')
         parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')

Mutant 2637

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -117,7 +117,7 @@
         parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')
         parser.add_argument('--encoder-learned-pos', action='store_true',
-                            help='use learned positional embeddings in the encoder')
+                            help='XXuse learned positional embeddings in the encoderXX')
         parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',

Mutant 2638

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -118,7 +118,7 @@
                             help='apply layernorm before each encoder block')
         parser.add_argument('--encoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the encoder')
-        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+        parser.add_argument('XX--decoder-embed-pathXX', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')

Mutant 2639

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -118,7 +118,7 @@
                             help='apply layernorm before each encoder block')
         parser.add_argument('--encoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the encoder')
-        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+        parser.add_argument('--decoder-embed-path', type=str, metavar='XXSTRXX',
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')

Mutant 2640

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -119,7 +119,7 @@
         parser.add_argument('--encoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the encoder')
         parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
-                            help='path to pre-trained decoder embedding')
+                            help='XXpath to pre-trained decoder embeddingXX')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',

Mutant 2641

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -120,7 +120,7 @@
                             help='use learned positional embeddings in the encoder')
         parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-embed-dimXX', type=int, metavar='N',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')

Mutant 2642

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -120,7 +120,7 @@
                             help='use learned positional embeddings in the encoder')
         parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
-        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='XXNXX',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')

Mutant 2643

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -121,7 +121,7 @@
         parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension')
+                            help='XXdecoder embedding dimensionXX')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',

Mutant 2644

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -122,7 +122,7 @@
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-ffn-embed-dimXX', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')

Mutant 2645

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -122,7 +122,7 @@
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='XXNXX',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')

Mutant 2646

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -123,7 +123,7 @@
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
-                            help='decoder embedding dimension for FFN')
+                            help='XXdecoder embedding dimension for FFNXX')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',

Mutant 2647

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -124,7 +124,7 @@
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
+        parser.add_argument('XX--decoder-layersXX', type=int, metavar='N',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')

Mutant 2648

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -124,7 +124,7 @@
                             help='decoder embedding dimension')
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
-        parser.add_argument('--decoder-layers', type=int, metavar='N',
+        parser.add_argument('--decoder-layers', type=int, metavar='XXNXX',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')

Mutant 2649

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -125,7 +125,7 @@
         parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
-                            help='num decoder layers')
+                            help='XXnum decoder layersXX')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-learned-pos', action='store_true',

Mutant 2650

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -126,7 +126,7 @@
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+        parser.add_argument('XX--decoder-attention-headsXX', type=int, metavar='N',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')

Mutant 2651

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -126,7 +126,7 @@
                             help='decoder embedding dimension for FFN')
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
-        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+        parser.add_argument('--decoder-attention-heads', type=int, metavar='XXNXX',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')

Mutant 2652

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -127,7 +127,7 @@
         parser.add_argument('--decoder-layers', type=int, metavar='N',
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
-                            help='num decoder attention heads')
+                            help='XXnum decoder attention headsXX')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--decoder-normalize-before', action='store_true',

Mutant 2653

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -128,7 +128,7 @@
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
-        parser.add_argument('--decoder-learned-pos', action='store_true',
+        parser.add_argument('XX--decoder-learned-posXX', action='store_true',
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')

Mutant 2655

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -129,7 +129,7 @@
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
         parser.add_argument('--decoder-learned-pos', action='store_true',
-                            help='use learned positional embeddings in the decoder')
+                            help='XXuse learned positional embeddings in the decoderXX')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',

Mutant 2656

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -130,7 +130,7 @@
                             help='num decoder attention heads')
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
-        parser.add_argument('--decoder-normalize-before', action='store_true',
+        parser.add_argument('XX--decoder-normalize-beforeXX', action='store_true',
                             help='apply layernorm before each decoder block')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension (extra linear layer '

Mutant 2658

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -131,7 +131,7 @@
         parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--decoder-normalize-before', action='store_true',
-                            help='apply layernorm before each decoder block')
+                            help='XXapply layernorm before each decoder blockXX')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension (extra linear layer '
                                  'if different from decoder embed dim')

Mutant 2659

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -132,7 +132,7 @@
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+        parser.add_argument('XX--decoder-output-dimXX', type=int, metavar='N',
                             help='decoder output dimension (extra linear layer '
                                  'if different from decoder embed dim')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',

Mutant 2660

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -132,7 +132,7 @@
                             help='use learned positional embeddings in the decoder')
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
-        parser.add_argument('--decoder-output-dim', type=int, metavar='N',
+        parser.add_argument('--decoder-output-dim', type=int, metavar='XXNXX',
                             help='decoder output dimension (extra linear layer '
                                  'if different from decoder embed dim')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',

Mutant 2661

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -133,7 +133,7 @@
         parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
-                            help='decoder output dimension (extra linear layer '
+                            help='XXdecoder output dimension (extra linear layer XX'
                                  'if different from decoder embed dim')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')

Mutant 2662

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -135,7 +135,7 @@
         parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                             help='decoder output dimension (extra linear layer '
                                  'if different from decoder embed dim')
-        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
+        parser.add_argument('XX--share-decoder-input-output-embedXX', action='store_true',
                             help='share decoder input and output embeddings')
         parser.add_argument('--share-all-embeddings', action='store_true',
                             help='share encoder, decoder and output embeddings'

Mutant 2664

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -136,7 +136,7 @@
                             help='decoder output dimension (extra linear layer '
                                  'if different from decoder embed dim')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
-                            help='share decoder input and output embeddings')
+                            help='XXshare decoder input and output embeddingsXX')
         parser.add_argument('--share-all-embeddings', action='store_true',
                             help='share encoder, decoder and output embeddings'
                                  ' (requires shared dictionary and embed dim)')

Mutant 2665

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -137,7 +137,7 @@
                                  'if different from decoder embed dim')
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
-        parser.add_argument('--share-all-embeddings', action='store_true',
+        parser.add_argument('XX--share-all-embeddingsXX', action='store_true',
                             help='share encoder, decoder and output embeddings'
                                  ' (requires shared dictionary and embed dim)')
         parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',

Mutant 2667

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -138,7 +138,7 @@
         parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
         parser.add_argument('--share-all-embeddings', action='store_true',
-                            help='share encoder, decoder and output embeddings'
+                            help='XXshare encoder, decoder and output embeddingsXX'
                                  ' (requires shared dictionary and embed dim)')
         parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')

Mutant 2668

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -140,7 +140,7 @@
         parser.add_argument('--share-all-embeddings', action='store_true',
                             help='share encoder, decoder and output embeddings'
                                  ' (requires shared dictionary and embed dim)')
-        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
+        parser.add_argument('XX--no-token-positional-embeddingsXX', default=False, action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '

Mutant 2669

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -140,7 +140,7 @@
         parser.add_argument('--share-all-embeddings', action='store_true',
                             help='share encoder, decoder and output embeddings'
                                  ' (requires shared dictionary and embed dim)')
-        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
+        parser.add_argument('--no-token-positional-embeddings', default=True, action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '

Mutant 2671

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -141,7 +141,7 @@
                             help='share encoder, decoder and output embeddings'
                                  ' (requires shared dictionary and embed dim)')
         parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
-                            help='if set, disables positional embeddings (outside self attention)')
+                            help='XXif set, disables positional embeddings (outside self attention)XX')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion'),

Mutant 2672

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -142,7 +142,7 @@
                                  ' (requires shared dictionary and embed dim)')
         parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+        parser.add_argument('XX--adaptive-softmax-cutoffXX', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion'),
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',

Mutant 2673

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -142,7 +142,7 @@
                                  ' (requires shared dictionary and embed dim)')
         parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
-        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='XXEXPRXX',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion'),
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',

Mutant 2674

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -143,7 +143,7 @@
         parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                             help='if set, disables positional embeddings (outside self attention)')
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
-                            help='comma separated list of adaptive softmax cutoff points. '
+                            help='XXcomma separated list of adaptive softmax cutoff points. XX'
                                  'Must be used with adaptive_loss criterion'),
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')

Mutant 2675

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -145,7 +145,7 @@
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion'),
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+        parser.add_argument('XX--adaptive-softmax-dropoutXX', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')

Mutant 2676

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -145,7 +145,7 @@
         parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion'),
-        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='XXDXX',
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')

Mutant 2677

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -146,7 +146,7 @@
                             help='comma separated list of adaptive softmax cutoff points. '
                                  'Must be used with adaptive_loss criterion'),
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
-                            help='sets adaptive softmax dropout for the tail projections')
+                            help='XXsets adaptive softmax dropout for the tail projectionsXX')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')
         parser.add_argument('--no-scale-embedding', action='store_true',

Mutant 2678

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -147,7 +147,7 @@
                                  'Must be used with adaptive_loss criterion'),
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
-        parser.add_argument('--layernorm-embedding', action='store_true',
+        parser.add_argument('XX--layernorm-embeddingXX', action='store_true',
                             help='add layernorm to embedding')
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')

Mutant 2680

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -148,7 +148,7 @@
         parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--layernorm-embedding', action='store_true',
-                            help='add layernorm to embedding')
+                            help='XXadd layernorm to embeddingXX')
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)

Mutant 2681

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -149,7 +149,7 @@
                             help='sets adaptive softmax dropout for the tail projections')
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')
-        parser.add_argument('--no-scale-embedding', action='store_true',
+        parser.add_argument('XX--no-scale-embeddingXX', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
         parser.add_argument('--no-cross-attention', default=False, action='store_true',

Mutant 2683

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -150,7 +150,7 @@
         parser.add_argument('--layernorm-embedding', action='store_true',
                             help='add layernorm to embedding')
         parser.add_argument('--no-scale-embedding', action='store_true',
-                            help='if True, dont scale embeddings')
+                            help='XXif True, dont scale embeddingsXX')
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
         parser.add_argument('--no-cross-attention', default=False, action='store_true',
                             help='do not perform cross-attention')

Mutant 2684

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -152,7 +152,7 @@
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
-        parser.add_argument('--no-cross-attention', default=False, action='store_true',
+        parser.add_argument('XX--no-cross-attentionXX', default=False, action='store_true',
                             help='do not perform cross-attention')
         parser.add_argument('--cross-self-attention', default=False, action='store_true',
                             help='perform cross+self-attention')

Mutant 2685

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -152,7 +152,7 @@
         parser.add_argument('--no-scale-embedding', action='store_true',
                             help='if True, dont scale embeddings')
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
-        parser.add_argument('--no-cross-attention', default=False, action='store_true',
+        parser.add_argument('--no-cross-attention', default=True, action='store_true',
                             help='do not perform cross-attention')
         parser.add_argument('--cross-self-attention', default=False, action='store_true',
                             help='perform cross+self-attention')

Mutant 2687

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -153,7 +153,7 @@
                             help='if True, dont scale embeddings')
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
         parser.add_argument('--no-cross-attention', default=False, action='store_true',
-                            help='do not perform cross-attention')
+                            help='XXdo not perform cross-attentionXX')
         parser.add_argument('--cross-self-attention', default=False, action='store_true',
                             help='perform cross+self-attention')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)

Mutant 2688

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -154,7 +154,7 @@
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
         parser.add_argument('--no-cross-attention', default=False, action='store_true',
                             help='do not perform cross-attention')
-        parser.add_argument('--cross-self-attention', default=False, action='store_true',
+        parser.add_argument('XX--cross-self-attentionXX', default=False, action='store_true',
                             help='perform cross+self-attention')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,

Mutant 2689

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -154,7 +154,7 @@
         # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
         parser.add_argument('--no-cross-attention', default=False, action='store_true',
                             help='do not perform cross-attention')
-        parser.add_argument('--cross-self-attention', default=False, action='store_true',
+        parser.add_argument('--cross-self-attention', default=True, action='store_true',
                             help='perform cross+self-attention')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,

Mutant 2691

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -155,7 +155,7 @@
         parser.add_argument('--no-cross-attention', default=False, action='store_true',
                             help='do not perform cross-attention')
         parser.add_argument('--cross-self-attention', default=False, action='store_true',
-                            help='perform cross+self-attention')
+                            help='XXperform cross+self-attentionXX')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')

Mutant 2692

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -157,7 +157,7 @@
         parser.add_argument('--cross-self-attention', default=False, action='store_true',
                             help='perform cross+self-attention')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('XX--encoder-layerdropXX', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')

Mutant 2693

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -157,7 +157,7 @@
         parser.add_argument('--cross-self-attention', default=False, action='store_true',
                             help='perform cross+self-attention')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--encoder-layerdrop', type=float, metavar='XXDXX', default=0,
                             help='LayerDrop probability for encoder')
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')

Mutant 2694

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -157,7 +157,7 @@
         parser.add_argument('--cross-self-attention', default=False, action='store_true',
                             help='perform cross+self-attention')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=1,
                             help='LayerDrop probability for encoder')
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')

Mutant 2695

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -158,7 +158,7 @@
                             help='perform cross+self-attention')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
-                            help='LayerDrop probability for encoder')
+                            help='XXLayerDrop probability for encoderXX')
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,

Mutant 2696

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -159,7 +159,7 @@
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')
-        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('XX--decoder-layerdropXX', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 2697

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -159,7 +159,7 @@
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')
-        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--decoder-layerdrop', type=float, metavar='XXDXX', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 2698

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -159,7 +159,7 @@
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')
-        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=1,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 2699

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -160,7 +160,7 @@
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
-                            help='LayerDrop probability for decoder')
+                            help='XXLayerDrop probability for decoderXX')
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         parser.add_argument('--decoder-layers-to-keep', default=None,

Mutant 2700

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -161,7 +161,7 @@
                             help='LayerDrop probability for encoder')
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
-        parser.add_argument('--encoder-layers-to-keep', default=None,
+        parser.add_argument('XX--encoder-layers-to-keepXX', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 2701

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -162,7 +162,7 @@
         parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for decoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,
-                            help='which layers to *keep* when pruning as a comma-separated list')
+                            help='XXwhich layers to *keep* when pruning as a comma-separated listXX')
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)

Mutant 2702

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -163,7 +163,7 @@
                             help='LayerDrop probability for decoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
-        parser.add_argument('--decoder-layers-to-keep', default=None,
+        parser.add_argument('XX--decoder-layers-to-keepXX', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,

Mutant 2703

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -164,7 +164,7 @@
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         parser.add_argument('--decoder-layers-to-keep', default=None,
-                            help='which layers to *keep* when pruning as a comma-separated list')
+                            help='XXwhich layers to *keep* when pruning as a comma-separated listXX')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')

Mutant 2704

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -166,7 +166,7 @@
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('XX--quant-noise-pqXX', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 2705

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -166,7 +166,7 @@
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-pq', type=float, metavar='XXDXX', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 2706

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -166,7 +166,7 @@
         parser.add_argument('--decoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=1,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 2707

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -167,7 +167,7 @@
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
-                            help='iterative PQ quantization noise at training time')
+                            help='XXiterative PQ quantization noise at training timeXX')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,

Mutant 2708

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -168,7 +168,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('XX--quant-noise-pq-block-sizeXX', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 2709

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -168,7 +168,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='XXDXX', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 2710

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -168,7 +168,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=9,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 2711

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -169,7 +169,7 @@
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
-                            help='block size of quantization noise at training time')
+                            help='XXblock size of quantization noise at training timeXX')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on

Mutant 2712

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -170,7 +170,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('XX--quant-noise-scalarXX', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on
 

Mutant 2713

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -170,7 +170,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='XXDXX', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on
 

Mutant 2714

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -170,7 +170,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=1,
                             help='scalar quantization noise and scalar quantization at training time')
         # fmt: on
 

Mutant 2715

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -171,7 +171,7 @@
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
-                            help='scalar quantization noise and scalar quantization at training time')
+                            help='XXscalar quantization noise and scalar quantization at training timeXX')
         # fmt: on
 
     @classmethod

Mutant 2716

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -172,9 +172,7 @@
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
-        # fmt: on
-
-    @classmethod
+
     def build_model(cls, args, task):
         """Build a new model instance."""
 

Mutant 2717

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -223,7 +223,6 @@
         decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
         return cls(args, encoder, decoder)
 
-    @classmethod
     def build_embedding(cls, args, dictionary, embed_dim, path=None):
         num_embeddings = len(dictionary)
         padding_idx = dictionary.pad()

Mutant 2718

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -235,7 +235,6 @@
             utils.load_embedding(embed_dict, dictionary, emb)
         return emb
 
-    @classmethod
     def build_encoder(cls, args, src_dict, embed_tokens):
         return TransformerEncoder(args, src_dict, embed_tokens)
 

Mutant 2719

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -239,7 +239,6 @@
     def build_encoder(cls, args, src_dict, embed_tokens):
         return TransformerEncoder(args, src_dict, embed_tokens)
 
-    @classmethod
     def build_decoder(cls, args, tgt_dict, embed_tokens):
         return TransformerDecoder(
             args,

Mutant 2720

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -280,10 +280,6 @@
         )
         return decoder_out
 
-    # Since get_normalized_probs is in the Fairseq Model which is not scriptable,
-    # I rewrite the get_normalized_probs from Base Class to call the
-    # helper function in the Base Class.
-    @torch.jit.export
     def get_normalized_probs(
         self,
         net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],

Mutant 2721

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -374,7 +374,7 @@
             x = self.quant_noise(x)
         return x, embed
 
-    def forward(self, src_tokens, src_lengths, return_all_hiddens: bool = False):
+    def forward(self, src_tokens, src_lengths, return_all_hiddens: bool = True):
         """
         Args:
             src_tokens (LongTensor): tokens in the source language of shape

Mutant 2722

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -425,7 +425,6 @@
             src_lengths=None,
         )
 
-    @torch.jit.export
     def reorder_encoder_out(self, encoder_out: EncoderOut, new_order):
         """
         Reorder encoder output according to *new_order*.

Mutant 2723

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -526,7 +526,7 @@
             (default: False).
     """
 
-    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=True):
         self.args = args
         super().__init__(dictionary)
         self.register_buffer("version", torch.Tensor([3]))

Mutant 2724

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -633,7 +633,7 @@
                 self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
             )
 
-    def build_decoder_layer(self, args, no_encoder_attn=False):
+    def build_decoder_layer(self, args, no_encoder_attn=True):
         return TransformerDecoderLayer(args, no_encoder_attn)
 
     def forward(

Mutant 2725

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -645,7 +645,7 @@
         alignment_layer: Optional[int] = None,
         alignment_heads: Optional[int] = None,
         src_lengths: Optional[Any] = None,
-        return_all_hiddens: bool = False,
+        return_all_hiddens: bool = True,
     ):
         """
         Args:

Mutant 2726

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -891,7 +891,7 @@
     return m
 
 
-def Linear(in_features, out_features, bias=True):
+def Linear(in_features, out_features, bias=False):
     m = nn.Linear(in_features, out_features, bias)
     nn.init.xavier_uniform_(m.weight)
     if bias:

Mutant 2728

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -899,7 +899,7 @@
     return m
 
 
-@register_model_architecture("transformer", "transformer")
+@register_model_architecture("transformer", "XXtransformerXX")
 def base_architecture(args):
     args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
     args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)

Mutant 2729

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -898,8 +898,6 @@
         nn.init.constant_(m.bias, 0.0)
     return m
 
-
-@register_model_architecture("transformer", "transformer")
 def base_architecture(args):
     args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
     args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)

Mutant 2731

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -944,7 +944,7 @@
     args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
 
 
-@register_model_architecture("transformer", "transformer_iwslt_de_en")
+@register_model_architecture("transformer", "XXtransformer_iwslt_de_enXX")
 def transformer_iwslt_de_en(args):
     args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
     args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)

Mutant 2732

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -943,8 +943,6 @@
     args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
     args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
 
-
-@register_model_architecture("transformer", "transformer_iwslt_de_en")
 def transformer_iwslt_de_en(args):
     args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
     args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)

Mutant 2734

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -957,7 +957,7 @@
     base_architecture(args)
 
 
-@register_model_architecture("transformer", "transformer_wmt_en_de")
+@register_model_architecture("transformer", "XXtransformer_wmt_en_deXX")
 def transformer_wmt_en_de(args):
     base_architecture(args)
 

Mutant 2735

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -956,8 +956,6 @@
     args.decoder_layers = getattr(args, "decoder_layers", 6)
     base_architecture(args)
 
-
-@register_model_architecture("transformer", "transformer_wmt_en_de")
 def transformer_wmt_en_de(args):
     base_architecture(args)
 

Mutant 2737

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -963,7 +963,7 @@
 
 
 # parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
-@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big")
+@register_model_architecture("transformer", "XXtransformer_vaswani_wmt_en_de_bigXX")
 def transformer_vaswani_wmt_en_de_big(args):
     args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
     args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)

Mutant 2738

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -961,9 +961,6 @@
 def transformer_wmt_en_de(args):
     base_architecture(args)
 
-
-# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
-@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big")
 def transformer_vaswani_wmt_en_de_big(args):
     args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
     args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)

Mutant 2740

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -976,7 +976,7 @@
     base_architecture(args)
 
 
-@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big")
+@register_model_architecture("transformer", "XXtransformer_vaswani_wmt_en_fr_bigXX")
 def transformer_vaswani_wmt_en_fr_big(args):
     args.dropout = getattr(args, "dropout", 0.1)
     transformer_vaswani_wmt_en_de_big(args)

Mutant 2741

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -975,8 +975,6 @@
     args.dropout = getattr(args, "dropout", 0.3)
     base_architecture(args)
 
-
-@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big")
 def transformer_vaswani_wmt_en_fr_big(args):
     args.dropout = getattr(args, "dropout", 0.1)
     transformer_vaswani_wmt_en_de_big(args)

Mutant 2743

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -982,7 +982,7 @@
     transformer_vaswani_wmt_en_de_big(args)
 
 
-@register_model_architecture("transformer", "transformer_wmt_en_de_big")
+@register_model_architecture("transformer", "XXtransformer_wmt_en_de_bigXX")
 def transformer_wmt_en_de_big(args):
     args.attention_dropout = getattr(args, "attention_dropout", 0.1)
     transformer_vaswani_wmt_en_de_big(args)

Mutant 2744

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -981,8 +981,6 @@
     args.dropout = getattr(args, "dropout", 0.1)
     transformer_vaswani_wmt_en_de_big(args)
 
-
-@register_model_architecture("transformer", "transformer_wmt_en_de_big")
 def transformer_wmt_en_de_big(args):
     args.attention_dropout = getattr(args, "attention_dropout", 0.1)
     transformer_vaswani_wmt_en_de_big(args)

Mutant 2746

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -989,7 +989,7 @@
 
 
 # default parameters used in tensor2tensor implementation
-@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t")
+@register_model_architecture("transformer", "XXtransformer_wmt_en_de_big_t2tXX")
 def transformer_wmt_en_de_big_t2t(args):
     args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
     args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)

Mutant 2747

--- fairseq/models/transformer.py
+++ fairseq/models/transformer.py
@@ -987,9 +987,6 @@
     args.attention_dropout = getattr(args, "attention_dropout", 0.1)
     transformer_vaswani_wmt_en_de_big(args)
 
-
-# default parameters used in tensor2tensor implementation
-@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t")
 def transformer_wmt_en_de_big_t2t(args):
     args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
     args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)