fairseq/models/roberta/model.py

Killed 9 out of 87 mutants

Survived

The mutants listed below survived mutation testing: the test suite still passed with each change applied. Each one shows a hole in your test suite.

Mutant 2852

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -29,7 +29,7 @@
 from .hub_interface import RobertaHubInterface
 
 
-logger = logging.getLogger(__name__)
+logger = None
 
 
 @register_model('roberta')

Mutant 2855

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -35,7 +35,6 @@
 @register_model('roberta')
 class RobertaModel(FairseqEncoderModel):
 
-    @classmethod
     def hub_models(cls):
         return {
             'roberta.base': 'http://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz',

Mutant 2856

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -53,7 +53,6 @@
 
         self.classification_heads = nn.ModuleDict()
 
-    @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
         parser.add_argument('--encoder-layers', type=int, metavar='L',

Mutant 2857

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -56,7 +56,7 @@
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--encoder-layers', type=int, metavar='L',
+        parser.add_argument('XX--encoder-layersXX', type=int, metavar='L',
                             help='num encoder layers')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
                             help='encoder embedding dimension')

Mutant 2858

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -56,7 +56,7 @@
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--encoder-layers', type=int, metavar='L',
+        parser.add_argument('--encoder-layers', type=int, metavar='XXLXX',
                             help='num encoder layers')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
                             help='encoder embedding dimension')

Mutant 2859

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -57,7 +57,7 @@
     def add_args(parser):
         """Add model-specific arguments to the parser."""
         parser.add_argument('--encoder-layers', type=int, metavar='L',
-                            help='num encoder layers')
+                            help='XXnum encoder layersXX')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',

Mutant 2860

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -58,7 +58,7 @@
         """Add model-specific arguments to the parser."""
         parser.add_argument('--encoder-layers', type=int, metavar='L',
                             help='num encoder layers')
-        parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
+        parser.add_argument('XX--encoder-embed-dimXX', type=int, metavar='H',
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
                             help='encoder embedding dimension for FFN')

Mutant 2861

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -58,7 +58,7 @@
         """Add model-specific arguments to the parser."""
         parser.add_argument('--encoder-layers', type=int, metavar='L',
                             help='num encoder layers')
-        parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='XXHXX',
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
                             help='encoder embedding dimension for FFN')

Mutant 2862

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -59,7 +59,7 @@
         parser.add_argument('--encoder-layers', type=int, metavar='L',
                             help='num encoder layers')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
-                            help='encoder embedding dimension')
+                            help='XXencoder embedding dimensionXX')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='A',

Mutant 2863

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -60,7 +60,7 @@
                             help='num encoder layers')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
                             help='encoder embedding dimension')
-        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
+        parser.add_argument('XX--encoder-ffn-embed-dimXX', type=int, metavar='F',
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='A',
                             help='num encoder attention heads')

Mutant 2864

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -60,7 +60,7 @@
                             help='num encoder layers')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
                             help='encoder embedding dimension')
-        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
+        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='XXFXX',
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='A',
                             help='num encoder attention heads')

Mutant 2865

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -61,7 +61,7 @@
         parser.add_argument('--encoder-embed-dim', type=int, metavar='H',
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
-                            help='encoder embedding dimension for FFN')
+                            help='XXencoder embedding dimension for FFNXX')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='A',
                             help='num encoder attention heads')
         parser.add_argument('--activation-fn',

Mutant 2866

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -62,7 +62,7 @@
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
                             help='encoder embedding dimension for FFN')
-        parser.add_argument('--encoder-attention-heads', type=int, metavar='A',
+        parser.add_argument('XX--encoder-attention-headsXX', type=int, metavar='A',
                             help='num encoder attention heads')
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),

Mutant 2867

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -62,7 +62,7 @@
                             help='encoder embedding dimension')
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
                             help='encoder embedding dimension for FFN')
-        parser.add_argument('--encoder-attention-heads', type=int, metavar='A',
+        parser.add_argument('--encoder-attention-heads', type=int, metavar='XXAXX',
                             help='num encoder attention heads')
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),

Mutant 2868

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -63,7 +63,7 @@
         parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='F',
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='A',
-                            help='num encoder attention heads')
+                            help='XXnum encoder attention headsXX')
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')

Mutant 2869

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -64,7 +64,7 @@
                             help='encoder embedding dimension for FFN')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='A',
                             help='num encoder attention heads')
-        parser.add_argument('--activation-fn',
+        parser.add_argument('XX--activation-fnXX',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
         parser.add_argument('--pooler-activation-fn',

Mutant 2870

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -66,7 +66,7 @@
                             help='num encoder attention heads')
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
-                            help='activation function to use')
+                            help='XXactivation function to useXX')
         parser.add_argument('--pooler-activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use for pooler layer')

Mutant 2871

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -67,7 +67,7 @@
         parser.add_argument('--activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use')
-        parser.add_argument('--pooler-activation-fn',
+        parser.add_argument('XX--pooler-activation-fnXX',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use for pooler layer')
         parser.add_argument('--encoder-normalize-before', action='store_true',

Mutant 2872

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -69,7 +69,7 @@
                             help='activation function to use')
         parser.add_argument('--pooler-activation-fn',
                             choices=utils.get_available_activation_fns(),
-                            help='activation function to use for pooler layer')
+                            help='XXactivation function to use for pooler layerXX')
         parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')
         parser.add_argument('--dropout', type=float, metavar='D',

Mutant 2873

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -70,7 +70,7 @@
         parser.add_argument('--pooler-activation-fn',
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use for pooler layer')
-        parser.add_argument('--encoder-normalize-before', action='store_true',
+        parser.add_argument('XX--encoder-normalize-beforeXX', action='store_true',
                             help='apply layernorm before each encoder block')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')

Mutant 2875

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -71,7 +71,7 @@
                             choices=utils.get_available_activation_fns(),
                             help='activation function to use for pooler layer')
         parser.add_argument('--encoder-normalize-before', action='store_true',
-                            help='apply layernorm before each encoder block')
+                            help='XXapply layernorm before each encoder blockXX')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',

Mutant 2876

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -72,7 +72,7 @@
                             help='activation function to use for pooler layer')
         parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')
-        parser.add_argument('--dropout', type=float, metavar='D',
+        parser.add_argument('XX--dropoutXX', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')

Mutant 2877

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -72,7 +72,7 @@
                             help='activation function to use for pooler layer')
         parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')
-        parser.add_argument('--dropout', type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='XXDXX',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')

Mutant 2878

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -73,7 +73,7 @@
         parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')
         parser.add_argument('--dropout', type=float, metavar='D',
-                            help='dropout probability')
+                            help='XXdropout probabilityXX')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', type=float, metavar='D',

Mutant 2879

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -74,7 +74,7 @@
                             help='apply layernorm before each encoder block')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
-        parser.add_argument('--attention-dropout', type=float, metavar='D',
+        parser.add_argument('XX--attention-dropoutXX', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN')

Mutant 2880

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -74,7 +74,7 @@
                             help='apply layernorm before each encoder block')
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
-        parser.add_argument('--attention-dropout', type=float, metavar='D',
+        parser.add_argument('--attention-dropout', type=float, metavar='XXDXX',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN')

Mutant 2881

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -75,7 +75,7 @@
         parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
-                            help='dropout probability for attention weights')
+                            help='XXdropout probability for attention weightsXX')
         parser.add_argument('--activation-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN')
         parser.add_argument('--pooler-dropout', type=float, metavar='D',

Mutant 2882

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -76,7 +76,7 @@
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
-        parser.add_argument('--activation-dropout', type=float, metavar='D',
+        parser.add_argument('XX--activation-dropoutXX', type=float, metavar='D',
                             help='dropout probability after activation in FFN')
         parser.add_argument('--pooler-dropout', type=float, metavar='D',
                             help='dropout probability in the masked_lm pooler layers')

Mutant 2883

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -76,7 +76,7 @@
                             help='dropout probability')
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
-        parser.add_argument('--activation-dropout', type=float, metavar='D',
+        parser.add_argument('--activation-dropout', type=float, metavar='XXDXX',
                             help='dropout probability after activation in FFN')
         parser.add_argument('--pooler-dropout', type=float, metavar='D',
                             help='dropout probability in the masked_lm pooler layers')

Mutant 2884

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -77,7 +77,7 @@
         parser.add_argument('--attention-dropout', type=float, metavar='D',
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', type=float, metavar='D',
-                            help='dropout probability after activation in FFN')
+                            help='XXdropout probability after activation in FFNXX')
         parser.add_argument('--pooler-dropout', type=float, metavar='D',
                             help='dropout probability in the masked_lm pooler layers')
         parser.add_argument('--max-positions', type=int,

Mutant 2885

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -78,7 +78,7 @@
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN')
-        parser.add_argument('--pooler-dropout', type=float, metavar='D',
+        parser.add_argument('XX--pooler-dropoutXX', type=float, metavar='D',
                             help='dropout probability in the masked_lm pooler layers')
         parser.add_argument('--max-positions', type=int,
                             help='number of positional embeddings to learn')

Mutant 2886

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -78,7 +78,7 @@
                             help='dropout probability for attention weights')
         parser.add_argument('--activation-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN')
-        parser.add_argument('--pooler-dropout', type=float, metavar='D',
+        parser.add_argument('--pooler-dropout', type=float, metavar='XXDXX',
                             help='dropout probability in the masked_lm pooler layers')
         parser.add_argument('--max-positions', type=int,
                             help='number of positional embeddings to learn')

Mutant 2887

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -79,7 +79,7 @@
         parser.add_argument('--activation-dropout', type=float, metavar='D',
                             help='dropout probability after activation in FFN')
         parser.add_argument('--pooler-dropout', type=float, metavar='D',
-                            help='dropout probability in the masked_lm pooler layers')
+                            help='XXdropout probability in the masked_lm pooler layersXX')
         parser.add_argument('--max-positions', type=int,
                             help='number of positional embeddings to learn')
         parser.add_argument('--load-checkpoint-heads', action='store_true',

Mutant 2888

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -80,7 +80,7 @@
                             help='dropout probability after activation in FFN')
         parser.add_argument('--pooler-dropout', type=float, metavar='D',
                             help='dropout probability in the masked_lm pooler layers')
-        parser.add_argument('--max-positions', type=int,
+        parser.add_argument('XX--max-positionsXX', type=int,
                             help='number of positional embeddings to learn')
         parser.add_argument('--load-checkpoint-heads', action='store_true',
                             help='(re-)register and load heads when loading checkpoints')

Mutant 2889

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -81,7 +81,7 @@
         parser.add_argument('--pooler-dropout', type=float, metavar='D',
                             help='dropout probability in the masked_lm pooler layers')
         parser.add_argument('--max-positions', type=int,
-                            help='number of positional embeddings to learn')
+                            help='XXnumber of positional embeddings to learnXX')
         parser.add_argument('--load-checkpoint-heads', action='store_true',
                             help='(re-)register and load heads when loading checkpoints')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)

Mutant 2890

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -82,7 +82,7 @@
                             help='dropout probability in the masked_lm pooler layers')
         parser.add_argument('--max-positions', type=int,
                             help='number of positional embeddings to learn')
-        parser.add_argument('--load-checkpoint-heads', action='store_true',
+        parser.add_argument('XX--load-checkpoint-headsXX', action='store_true',
                             help='(re-)register and load heads when loading checkpoints')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,

Mutant 2892

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -83,7 +83,7 @@
         parser.add_argument('--max-positions', type=int,
                             help='number of positional embeddings to learn')
         parser.add_argument('--load-checkpoint-heads', action='store_true',
-                            help='(re-)register and load heads when loading checkpoints')
+                            help='XX(re-)register and load heads when loading checkpointsXX')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')

Mutant 2893

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -85,7 +85,7 @@
         parser.add_argument('--load-checkpoint-heads', action='store_true',
                             help='(re-)register and load heads when loading checkpoints')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('XX--encoder-layerdropXX', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 2894

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -85,7 +85,7 @@
         parser.add_argument('--load-checkpoint-heads', action='store_true',
                             help='(re-)register and load heads when loading checkpoints')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--encoder-layerdrop', type=float, metavar='XXDXX', default=0,
                             help='LayerDrop probability for encoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 2895

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -85,7 +85,7 @@
         parser.add_argument('--load-checkpoint-heads', action='store_true',
                             help='(re-)register and load heads when loading checkpoints')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
-        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
+        parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=1,
                             help='LayerDrop probability for encoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')

Mutant 2896

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -86,7 +86,7 @@
                             help='(re-)register and load heads when loading checkpoints')
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
-                            help='LayerDrop probability for encoder')
+                            help='XXLayerDrop probability for encoderXX')
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)

Mutant 2897

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -87,7 +87,7 @@
         # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')
-        parser.add_argument('--encoder-layers-to-keep', default=None,
+        parser.add_argument('XX--encoder-layers-to-keepXX', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,

Mutant 2898

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -88,7 +88,7 @@
         parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                             help='LayerDrop probability for encoder')
         parser.add_argument('--encoder-layers-to-keep', default=None,
-                            help='which layers to *keep* when pruning as a comma-separated list')
+                            help='XXwhich layers to *keep* when pruning as a comma-separated listXX')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')

Mutant 2899

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -90,7 +90,7 @@
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('XX--quant-noise-pqXX', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 2900

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -90,7 +90,7 @@
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-pq', type=float, metavar='XXDXX', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 2901

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -90,7 +90,7 @@
         parser.add_argument('--encoder-layers-to-keep', default=None,
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
-        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=1,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')

Mutant 2902

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -91,7 +91,7 @@
                             help='which layers to *keep* when pruning as a comma-separated list')
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
-                            help='iterative PQ quantization noise at training time')
+                            help='XXiterative PQ quantization noise at training timeXX')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,

Mutant 2903

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -92,7 +92,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('XX--quant-noise-pq-block-sizeXX', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 2904

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -92,7 +92,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='XXDXX', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 2905

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -92,7 +92,7 @@
         # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
-        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
+        parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=9,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')

Mutant 2906

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -93,7 +93,7 @@
         parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
-                            help='block size of quantization noise at training time')
+                            help='XXblock size of quantization noise at training timeXX')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         parser.add_argument('--untie-weights-roberta', action='store_true',

Mutant 2907

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -94,7 +94,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('XX--quant-noise-scalarXX', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         parser.add_argument('--untie-weights-roberta', action='store_true',
                             help='Untie weights between embeddings and classifiers in RoBERTa')

Mutant 2908

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -94,7 +94,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='XXDXX', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         parser.add_argument('--untie-weights-roberta', action='store_true',
                             help='Untie weights between embeddings and classifiers in RoBERTa')

Mutant 2909

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -94,7 +94,7 @@
                             help='iterative PQ quantization noise at training time')
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
-        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
+        parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=1,
                             help='scalar quantization noise and scalar quantization at training time')
         parser.add_argument('--untie-weights-roberta', action='store_true',
                             help='Untie weights between embeddings and classifiers in RoBERTa')

Mutant 2910

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -95,7 +95,7 @@
         parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
-                            help='scalar quantization noise and scalar quantization at training time')
+                            help='XXscalar quantization noise and scalar quantization at training timeXX')
         parser.add_argument('--untie-weights-roberta', action='store_true',
                             help='Untie weights between embeddings and classifiers in RoBERTa')
 

Mutant 2911

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -96,7 +96,7 @@
                             help='block size of quantization noise at training time')
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
-        parser.add_argument('--untie-weights-roberta', action='store_true',
+        parser.add_argument('XX--untie-weights-robertaXX', action='store_true',
                             help='Untie weights between embeddings and classifiers in RoBERTa')
 
     @classmethod

Mutant 2913

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -97,7 +97,7 @@
         parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                             help='scalar quantization noise and scalar quantization at training time')
         parser.add_argument('--untie-weights-roberta', action='store_true',
-                            help='Untie weights between embeddings and classifiers in RoBERTa')
+                            help='XXUntie weights between embeddings and classifiers in RoBERTaXX')
 
     @classmethod
     def build_model(cls, args, task):

Mutant 2914

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -99,7 +99,6 @@
         parser.add_argument('--untie-weights-roberta', action='store_true',
                             help='Untie weights between embeddings and classifiers in RoBERTa')
 
-    @classmethod
     def build_model(cls, args, task):
         """Build a new model instance."""
 

Mutant 2915

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -112,7 +112,7 @@
         encoder = RobertaEncoder(args, task.source_dictionary)
         return cls(args, encoder)
 
-    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs):
+    def forward(self, src_tokens, features_only=True, return_all_hiddens=False, classification_head_name=None, **kwargs):
         if classification_head_name is not None:
             features_only = True
 

Mutant 2916

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -112,7 +112,7 @@
         encoder = RobertaEncoder(args, task.source_dictionary)
         return cls(args, encoder)
 
-    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs):
+    def forward(self, src_tokens, features_only=False, return_all_hiddens=True, classification_head_name=None, **kwargs):
         if classification_head_name is not None:
             features_only = True
 

Mutant 2917

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -152,7 +152,6 @@
             self.args.quant_noise_pq_block_size,
         )
 
-    @property
     def supported_targets(self):
         return {'self'}
 

Mutant 2918

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -156,7 +156,6 @@
     def supported_targets(self):
         return {'self'}
 
-    @classmethod
     def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='gpt2', **kwargs):
         from fairseq import hub_utils
         x = hub_utils.from_pretrained(

Mutant 2919

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -157,7 +157,7 @@
         return {'self'}
 
     @classmethod
-    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='gpt2', **kwargs):
+    def from_pretrained(cls, model_name_or_path, checkpoint_file='XXmodel.ptXX', data_name_or_path='.', bpe='gpt2', **kwargs):
         from fairseq import hub_utils
         x = hub_utils.from_pretrained(
             model_name_or_path,

Mutant 2920

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -157,7 +157,7 @@
         return {'self'}
 
     @classmethod
-    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='gpt2', **kwargs):
+    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='XX.XX', bpe='gpt2', **kwargs):
         from fairseq import hub_utils
         x = hub_utils.from_pretrained(
             model_name_or_path,

Mutant 2921

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -157,7 +157,7 @@
         return {'self'}
 
     @classmethod
-    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='gpt2', **kwargs):
+    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='XXgpt2XX', **kwargs):
         from fairseq import hub_utils
         x = hub_utils.from_pretrained(
             model_name_or_path,

Mutant 2922

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -260,7 +260,7 @@
 class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""
 
-    def __init__(self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout, q_noise=0, qn_block_size=8):
+    def __init__(self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout, q_noise=1, qn_block_size=8):
         super().__init__()
         self.dense = nn.Linear(input_dim, inner_dim)
         self.activation_fn = utils.get_activation_fn(activation_fn)

Mutant 2923

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -260,7 +260,7 @@
 class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""
 
-    def __init__(self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout, q_noise=0, qn_block_size=8):
+    def __init__(self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout, q_noise=0, qn_block_size=9):
         super().__init__()
         self.dense = nn.Linear(input_dim, inner_dim)
         self.activation_fn = utils.get_activation_fn(activation_fn)

Mutant 2924

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -317,7 +317,7 @@
             weight=self.sentence_encoder.embed_tokens.weight if not args.untie_weights_roberta else None,
         )
 
-    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, masked_tokens=None, **unused):
+    def forward(self, src_tokens, features_only=True, return_all_hiddens=False, masked_tokens=None, **unused):
         """
         Args:
             src_tokens (LongTensor): input tokens of shape `(batch, src_len)`

Mutant 2925

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -317,7 +317,7 @@
             weight=self.sentence_encoder.embed_tokens.weight if not args.untie_weights_roberta else None,
         )
 
-    def forward(self, src_tokens, features_only=False, return_all_hiddens=False, masked_tokens=None, **unused):
+    def forward(self, src_tokens, features_only=False, return_all_hiddens=True, masked_tokens=None, **unused):
         """
         Args:
             src_tokens (LongTensor): input tokens of shape `(batch, src_len)`

Mutant 2926

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -339,7 +339,7 @@
             x = self.output_layer(x, masked_tokens=masked_tokens)
         return x, extra
 
-    def extract_features(self, src_tokens, return_all_hiddens=False, **unused):
+    def extract_features(self, src_tokens, return_all_hiddens=True, **unused):
         inner_states, _ = self.sentence_encoder(
             src_tokens,
             last_state_only=not return_all_hiddens,

Mutant 2928

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -355,7 +355,7 @@
         return self.args.max_positions
 
 
-@register_model_architecture('roberta', 'roberta')
+@register_model_architecture('roberta', 'XXrobertaXX')
 def base_architecture(args):
     args.encoder_layers = getattr(args, 'encoder_layers', 12)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)

Mutant 2929

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -354,8 +354,6 @@
         """Maximum output length supported by the encoder."""
         return self.args.max_positions
 
-
-@register_model_architecture('roberta', 'roberta')
 def base_architecture(args):
     args.encoder_layers = getattr(args, 'encoder_layers', 12)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)

Mutant 2931

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -373,7 +373,7 @@
     args.encoder_layerdrop = getattr(args, 'encoder_layerdrop', 0.0)
 
 
-@register_model_architecture('roberta', 'roberta_base')
+@register_model_architecture('roberta', 'XXroberta_baseXX')
 def roberta_base_architecture(args):
     base_architecture(args)
 

Mutant 2932

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -372,8 +372,6 @@
     args.encoder_layers_to_keep = getattr(args, 'encoder_layers_to_keep', None)
     args.encoder_layerdrop = getattr(args, 'encoder_layerdrop', 0.0)
 
-
-@register_model_architecture('roberta', 'roberta_base')
 def roberta_base_architecture(args):
     base_architecture(args)
 

Mutant 2934

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -378,7 +378,7 @@
     base_architecture(args)
 
 
-@register_model_architecture('roberta', 'roberta_large')
+@register_model_architecture('roberta', 'XXroberta_largeXX')
 def roberta_large_architecture(args):
     args.encoder_layers = getattr(args, 'encoder_layers', 24)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1024)

Mutant 2935

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -377,8 +377,6 @@
 def roberta_base_architecture(args):
     base_architecture(args)
 
-
-@register_model_architecture('roberta', 'roberta_large')
 def roberta_large_architecture(args):
     args.encoder_layers = getattr(args, 'encoder_layers', 24)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1024)

Mutant 2937

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -387,7 +387,7 @@
     base_architecture(args)
 
 
-@register_model_architecture('roberta', 'xlm')
+@register_model_architecture('roberta', 'XXxlmXX')
 def xlm_architecture(args):
     args.encoder_layers = getattr(args, 'encoder_layers', 16)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1280)

Mutant 2938

--- fairseq/models/roberta/model.py
+++ fairseq/models/roberta/model.py
@@ -386,8 +386,6 @@
     args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16)
     base_architecture(args)
 
-
-@register_model_architecture('roberta', 'xlm')
 def xlm_architecture(args):
     args.encoder_layers = getattr(args, 'encoder_layers', 16)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1280)