Published July 31, 2021 | Version v1
Other Open

ESPnet2 pretrained model, kan-bayashi/ruslan_tts_train_transformer_raw_phn_espeak_ng_russian_train.loss.ave, fs=24000, lang=ru

Creators

Description

This model was trained by kan-bayashi using ruslan/tts1 recipe in espnet.

 

  • Python API
    See https://github.com/espnet/espnet_model_zoo
  • Evaluate in the recipe
    git clone https://github.com/espnet/espnet
    cd espnet
    git checkout 98691b62c37d04fa9f1f38d76ec13c0591d94832
    pip install -e .
    cd egs2/ruslan/tts1
    # Download the model file here
    ./run.sh --skip_data_prep false --skip_train true --download_model kan-bayashi/ruslan_tts_train_transformer_raw_phn_espeak_ng_russian_train.loss.ave
    
  • Config
    config: conf/tuning/train_transformer.yaml
    print_config: false
    log_level: INFO
    dry_run: false
    iterator_type: sequence
    output_dir: exp/tts_train_transformer_raw_phn_none
    ngpu: 1
    seed: 0
    num_workers: 1
    num_att_plot: 3
    dist_backend: nccl
    dist_init_method: env://
    dist_world_size: 4
    dist_rank: 0
    local_rank: 0
    dist_master_addr: localhost
    dist_master_port: 39093
    dist_launcher: null
    multiprocessing_distributed: true
    unused_parameters: false
    sharded_ddp: false
    cudnn_enabled: true
    cudnn_benchmark: false
    cudnn_deterministic: true
    collect_stats: false
    write_collected_feats: false
    max_epoch: 200
    patience: null
    val_scheduler_criterion:
    - valid
    - loss
    early_stopping_criterion:
    - valid
    - loss
    - min
    best_model_criterion:
    -   - valid
        - loss
        - min
    -   - train
        - loss
        - min
    keep_nbest_models: 5
    grad_clip: 1.0
    grad_clip_type: 2.0
    grad_noise: false
    accum_grad: 2
    no_forward_run: false
    resume: true
    train_dtype: float32
    use_amp: false
    log_interval: null
    use_tensorboard: true
    use_wandb: false
    wandb_project: null
    wandb_id: null
    wandb_entity: null
    wandb_name: null
    wandb_model_log_interval: -1
    detect_anomaly: false
    pretrain_path: null
    init_param: []
    ignore_init_mismatch: false
    freeze_param: []
    num_iters_per_epoch: 1000
    batch_size: 20
    valid_batch_size: null
    batch_bins: 9000000
    valid_batch_bins: null
    train_shape_file:
    - exp/tts_stats_raw_phn_none/train/text_shape.phn
    - exp/tts_stats_raw_phn_none/train/speech_shape
    valid_shape_file:
    - exp/tts_stats_raw_phn_none/valid/text_shape.phn
    - exp/tts_stats_raw_phn_none/valid/speech_shape
    batch_type: numel
    valid_batch_type: null
    fold_length:
    - 150
    - 240000
    sort_in_batch: descending
    sort_batch: descending
    multiple_iterator: false
    chunk_length: 500
    chunk_shift_ratio: 0.5
    num_cache_chunks: 1024
    train_data_path_and_name_and_type:
    -   - dump/raw/tr_no_dev_phn/text
        - text
        - text
    -   - dump/raw/tr_no_dev_phn/wav.scp
        - speech
        - sound
    valid_data_path_and_name_and_type:
    -   - dump/raw/dev_phn/text
        - text
        - text
    -   - dump/raw/dev_phn/wav.scp
        - speech
        - sound
    allow_variable_data_keys: false
    max_cache_size: 0.0
    max_cache_fd: 32
    valid_max_cache_size: null
    optim: adam
    optim_conf:
        lr: 1.0
    scheduler: noamlr
    scheduler_conf:
        model_size: 512
        warmup_steps: 8000
    token_list:
    - <blank>
    - <unk>
    - ʌ
    - i
    - n
    - t
    - ˈɑ
    - ˈo
    - s
    - r
    - a
    - ɭ
    - ˈe
    - v
    - k
    - j
    - m
    - ˈi
    - ɪ
    - .
    - p
    - y
    - nʲ
    - tʲ
    - d
    - ɭʲ
    - u
    - sʲ
    - rʲ
    - ','
    - z
    - b
    - tʃʲ
    - ˈu
    - ʃ
    - ɡ
    - ʒ
    - f
    - dʲ
    - vʲ
    - mʲ
    - x
    - ˈy
    - ts
    - kʲ
    - ˈɛ
    - ja
    - pʲ
    - ˈja
    - ju
    - bʲ
    - ɕ
    - e
    - o
    - ʑ
    - ˈɪ
    - …
    - ə
    - '?'
    - ɡʲ
    - ɛ
    - u"
    - fʲ
    - '!'
    - ˈu"
    - ˌy
    - ˈju
    - ».
    - ɪ^
    - ˌu
    - ɑ
    - »
    - ˌo
    - '!..'
    - ?!
    - …»
    - ?..
    - »,
    - «k
    - dʒʲ
    - «p
    - ˌɛ
    - '!»'
    - «s
    - «n
    - «v
    - «t
    - «m
    - »…
    - «d
    - «nʲ
    - «z
    - «b
    - ?»
    - «
    - «a
    - ?!.
    - «ɭʲ
    - ˌɪ
    - «ʌ
    - «ɡ
    - «f
    - '!..»'
    - «ɪ
    - «x
    - «ˈɑ
    - «j
    - ˌa
    - «mʲ
    - »?
    - «pʲ
    - «ʃ
    - «ˈja
    - «ˈo
    - «tʃʲ
    - '...'
    - ˈa
    - ˌɑ
    - «vʲ
    - «dʲ
    - «sʲ
    - «ɛ
    - «tʲ
    - «ˈɛ
    - «bʲ
    - ?..»
    - ˈɵ
    - "
    - ..
    - «fʲ
    - «ʒ
    - «ɭ
    - ",
    - ?!..
    - «ɡʲ
    - '!».'
    - «ˈu
    - ?!»
    - «ts
    - «ju
    - «ˈɪ
    - «u
    - ˌʌ
    - «ʑ
    - «ˈi
    - ˌi
    - «ˈju
    - ?!.»
    - ".
    - .,
    - ?»…
    - «ja
    - «ɕ
    - »!
    - «kʲ
    - «…v
    - ŋ
    - «…s
    - "
    - «…ˈja
    - ?».
    - ;
    - '!»…'
    - …p
    - ˌja
    - …».
    - …»,
    - "».
    - …»?
    - .ˈi
    - …k
    - …nʲ
    - '!»,'
    - »?..
    - «…k
    - …f
    - «…
    - "…
    - «…n
    - …ˈi
    - '!"'
    - .b
    - …t
    - …n
    - …v
    - «…j
    - «…ˈo
    - …ɡ
    - …vʲ
    - ?.
    - '!.'
    - «…m
    - '!"'
    - …u
    - …ˈɑ
    - "?
    - "».
    - ??
    - ."
    - .ˈju
    - .ˈɪ
    - «…p
    - ".
    - .a
    - .d
    - '!"».'
    - …s
    - …z
    - …j
    - …dʲ
    - «…ˈi
    - .ˈɛ
    - «…ɭʲ
    - «ˈy
    - '!"»'
    - …ɪ
    - «…dʲ
    - .…s
    - …kʲ
    - «…z
    - «…b
    - …m
    - '!!!»'
    - ˈə
    - ……………………………………
    - ?»,
    - "p
    - "m
    - "ɪ
    - »!..
    - …!»
    - …"».
    - '!,'
    - "…»
    - …."
    - ?".
    - …d
    - …ˈo
    - «…u
    - «…d
    - «…f
    - «…t
    - »?!
    - …ˈja
    - «…ˈɛ
    - «…ɡ
    - .s
    - …bʲ
    - «…x
    - «o
    - "?»
    - »!»
    - .?
    - «dʒʲ
    - .ɭʲ
    - «…tʃʲ
    - .ˈo
    - '!!!'
    - «…ˈɪ
    - …ʑ
    - <sos/eos>
    odim: null
    model_conf: {}
    use_preprocessor: true
    token_type: phn
    bpemodel: null
    non_linguistic_symbols: null
    cleaner: null
    g2p: espeak_ng_russian
    feats_extract: fbank
    feats_extract_conf:
        fs: 24000
        fmin: 80
        fmax: 7600
        n_mels: 80
        hop_length: 300
        n_fft: 2048
        win_length: 1200
    normalize: global_mvn
    normalize_conf:
        stats_file: exp/tts_stats_raw_phn_none/train/feats_stats.npz
    tts: transformer
    tts_conf:
        embed_dim: 0
        eprenet_conv_layers: 0
        eprenet_conv_filts: 0
        eprenet_conv_chans: 0
        dprenet_layers: 2
        dprenet_units: 256
        adim: 512
        aheads: 8
        elayers: 6
        eunits: 1024
        dlayers: 6
        dunits: 1024
        positionwise_layer_type: conv1d
        positionwise_conv_kernel_size: 1
        postnet_layers: 5
        postnet_filts: 5
        postnet_chans: 256
        use_masking: true
        bce_pos_weight: 5.0
        use_scaled_pos_enc: true
        encoder_normalize_before: true
        decoder_normalize_before: true
        reduction_factor: 1
        init_type: xavier_uniform
        init_enc_alpha: 1.0
        init_dec_alpha: 1.0
        eprenet_dropout_rate: 0.0
        dprenet_dropout_rate: 0.5
        postnet_dropout_rate: 0.5
        transformer_enc_dropout_rate: 0.1
        transformer_enc_positional_dropout_rate: 0.1
        transformer_enc_attn_dropout_rate: 0.1
        transformer_dec_dropout_rate: 0.1
        transformer_dec_positional_dropout_rate: 0.1
        transformer_dec_attn_dropout_rate: 0.1
        transformer_enc_dec_attn_dropout_rate: 0.1
        use_guided_attn_loss: true
        num_heads_applied_guided_attn: 2
        num_layers_applied_guided_attn: 2
        modules_applied_guided_attn:
        - encoder-decoder
        guided_attn_loss_sigma: 0.4
        guided_attn_loss_lambda: 10.0
    pitch_extract: null
    pitch_extract_conf: {}
    pitch_normalize: null
    pitch_normalize_conf: {}
    energy_extract: null
    energy_extract_conf: {}
    energy_normalize: null
    energy_normalize_conf: {}
    required:
    - output_dir
    - token_list
    version: 0.10.0
    distributed: true

Files

tts_train_transformer_raw_phn_espeak_ng_russian_train.loss.ave.zip

Files (133.6 MB)

Additional details

Related works

Is supplement to
https://github.com/espnet/espnet (URL)