Published October 4, 2022 | Version v1
Other Open

ESPnet2 pretrained model, kan-bayashi/jsut_tts_train_jets_raw_phn_jaconv_pyopenjtalk_prosody_train.total_count.ave_5best, fs=22050, lang=jp

Creators

Description

This model was trained by kan-bayashi using the jsut/tts1 recipe in ESPnet.

 

  • Python API
    See https://github.com/espnet/espnet_model_zoo
  • Evaluate in the recipe
    git clone https://github.com/espnet/espnet
    cd espnet
    git checkout 14fcb2d42b2609f766ffaa7a79e9c921cd8398d9
    pip install -e .
    cd egs2/jsut/tts1
    # Download the model file here
    ./run.sh --skip_data_prep false --skip_train true --download_model kan-bayashi/jsut_tts_train_jets_raw_phn_jaconv_pyopenjtalk_prosody_train.total_count.ave_5best
    
  • Config
    config: conf/tuning/train_jets.yaml
    print_config: false
    log_level: INFO
    dry_run: false
    iterator_type: sequence
    output_dir: exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk_prosody
    ngpu: 1
    seed: 777
    num_workers: 4
    num_att_plot: 3
    dist_backend: nccl
    dist_init_method: env://
    dist_world_size: 4
    dist_rank: 0
    local_rank: 0
    dist_master_addr: localhost
    dist_master_port: 46054
    dist_launcher: null
    multiprocessing_distributed: true
    unused_parameters: true
    sharded_ddp: false
    cudnn_enabled: true
    cudnn_benchmark: false
    cudnn_deterministic: false
    collect_stats: false
    write_collected_feats: false
    max_epoch: 1000
    patience: null
    val_scheduler_criterion:
    - valid
    - loss
    early_stopping_criterion:
    - valid
    - loss
    - min
    best_model_criterion:
    -   - valid
        - text2mel_loss
        - min
    -   - train
        - text2mel_loss
        - min
    -   - train
        - total_count
        - max
    keep_nbest_models: 5
    nbest_averaging_interval: 0
    grad_clip: -1
    grad_clip_type: 2.0
    grad_noise: false
    accum_grad: 1
    no_forward_run: false
    resume: true
    train_dtype: float32
    use_amp: false
    log_interval: 50
    use_matplotlib: true
    use_tensorboard: true
    create_graph_in_tensorboard: false
    use_wandb: false
    wandb_project: null
    wandb_id: null
    wandb_entity: null
    wandb_name: null
    wandb_model_log_interval: -1
    detect_anomaly: false
    pretrain_path: null
    init_param: []
    ignore_init_mismatch: false
    freeze_param: []
    num_iters_per_epoch: 1000
    batch_size: 20
    valid_batch_size: null
    batch_bins: 3000000
    valid_batch_bins: null
    train_shape_file:
    - exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/train/text_shape.phn
    - exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/train/speech_shape
    valid_shape_file:
    - exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/valid/text_shape.phn
    - exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/valid/speech_shape
    batch_type: numel
    valid_batch_type: null
    fold_length:
    - 150
    - 204800
    sort_in_batch: descending
    sort_batch: descending
    multiple_iterator: false
    chunk_length: 500
    chunk_shift_ratio: 0.5
    num_cache_chunks: 1024
    train_data_path_and_name_and_type:
    -   - dump/22k/raw/tr_no_dev/text
        - text
        - text
    -   - dump/22k/raw/tr_no_dev/wav.scp
        - speech
        - sound
    -   - exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/train/collect_feats/pitch.scp
        - pitch
        - npy
    -   - exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/train/collect_feats/energy.scp
        - energy
        - npy
    valid_data_path_and_name_and_type:
    -   - dump/22k/raw/dev/text
        - text
        - text
    -   - dump/22k/raw/dev/wav.scp
        - speech
        - sound
    -   - exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/valid/collect_feats/pitch.scp
        - pitch
        - npy
    -   - exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/valid/collect_feats/energy.scp
        - energy
        - npy
    allow_variable_data_keys: false
    max_cache_size: 0.0
    max_cache_fd: 32
    valid_max_cache_size: null
    optim: adamw
    optim_conf:
        lr: 0.0002
        betas:
        - 0.8
        - 0.99
        eps: 1.0e-09
        weight_decay: 0.0
    scheduler: exponentiallr
    scheduler_conf:
        gamma: 0.999875
    optim2: adamw
    optim2_conf:
        lr: 0.0002
        betas:
        - 0.8
        - 0.99
        eps: 1.0e-09
        weight_decay: 0.0
    scheduler2: exponentiallr
    scheduler2_conf:
        gamma: 0.999875
    generator_first: true
    token_list:
    - <blank>
    - <unk>
    - a
    - o
    - i
    - '['
    - '#'
    - u
    - ']'
    - e
    - k
    - n
    - t
    - r
    - s
    - N
    - m
    - _
    - sh
    - d
    - g
    - ^
    - $
    - w
    - cl
    - h
    - y
    - b
    - j
    - ts
    - ch
    - z
    - p
    - f
    - ky
    - ry
    - gy
    - hy
    - ny
    - by
    - my
    - py
    - v
    - dy
    - '?'
    - ty
    - <sos/eos>
    odim: null
    model_conf: {}
    use_preprocessor: true
    token_type: phn
    bpemodel: null
    non_linguistic_symbols: null
    cleaner: jaconv
    g2p: pyopenjtalk_prosody
    feats_extract: fbank
    feats_extract_conf:
        n_fft: 1024
        hop_length: 256
        win_length: null
        fs: 22050
        fmin: 80
        fmax: 7600
        n_mels: 80
    normalize: global_mvn
    normalize_conf:
        stats_file: exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/train/feats_stats.npz
    tts: jets
    tts_conf:
        generator_type: jets_generator
        generator_params:
            adim: 256
            aheads: 2
            elayers: 4
            eunits: 1024
            dlayers: 4
            dunits: 1024
            positionwise_layer_type: conv1d
            positionwise_conv_kernel_size: 3
            duration_predictor_layers: 2
            duration_predictor_chans: 256
            duration_predictor_kernel_size: 3
            use_masking: true
            encoder_normalize_before: true
            decoder_normalize_before: true
            encoder_type: transformer
            decoder_type: transformer
            conformer_rel_pos_type: latest
            conformer_pos_enc_layer_type: rel_pos
            conformer_self_attn_layer_type: rel_selfattn
            conformer_activation_type: swish
            use_macaron_style_in_conformer: true
            use_cnn_in_conformer: true
            conformer_enc_kernel_size: 7
            conformer_dec_kernel_size: 31
            init_type: xavier_uniform
            transformer_enc_dropout_rate: 0.2
            transformer_enc_positional_dropout_rate: 0.2
            transformer_enc_attn_dropout_rate: 0.2
            transformer_dec_dropout_rate: 0.2
            transformer_dec_positional_dropout_rate: 0.2
            transformer_dec_attn_dropout_rate: 0.2
            pitch_predictor_layers: 5
            pitch_predictor_chans: 256
            pitch_predictor_kernel_size: 5
            pitch_predictor_dropout: 0.5
            pitch_embed_kernel_size: 1
            pitch_embed_dropout: 0.0
            stop_gradient_from_pitch_predictor: true
            energy_predictor_layers: 2
            energy_predictor_chans: 256
            energy_predictor_kernel_size: 3
            energy_predictor_dropout: 0.5
            energy_embed_kernel_size: 1
            energy_embed_dropout: 0.0
            stop_gradient_from_energy_predictor: false
            generator_out_channels: 1
            generator_channels: 512
            generator_global_channels: -1
            generator_kernel_size: 7
            generator_upsample_scales:
            - 8
            - 8
            - 2
            - 2
            generator_upsample_kernel_sizes:
            - 16
            - 16
            - 4
            - 4
            generator_resblock_kernel_sizes:
            - 3
            - 7
            - 11
            generator_resblock_dilations:
            -   - 1
                - 3
                - 5
            -   - 1
                - 3
                - 5
            -   - 1
                - 3
                - 5
            generator_use_additional_convs: true
            generator_bias: true
            generator_nonlinear_activation: LeakyReLU
            generator_nonlinear_activation_params:
                negative_slope: 0.1
            generator_use_weight_norm: true
            segment_size: 64
            idim: 47
            odim: 80
        discriminator_type: hifigan_multi_scale_multi_period_discriminator
        discriminator_params:
            scales: 1
            scale_downsample_pooling: AvgPool1d
            scale_downsample_pooling_params:
                kernel_size: 4
                stride: 2
                padding: 2
            scale_discriminator_params:
                in_channels: 1
                out_channels: 1
                kernel_sizes:
                - 15
                - 41
                - 5
                - 3
                channels: 128
                max_downsample_channels: 1024
                max_groups: 16
                bias: true
                downsample_scales:
                - 2
                - 2
                - 4
                - 4
                - 1
                nonlinear_activation: LeakyReLU
                nonlinear_activation_params:
                    negative_slope: 0.1
                use_weight_norm: true
                use_spectral_norm: false
            follow_official_norm: false
            periods:
            - 2
            - 3
            - 5
            - 7
            - 11
            period_discriminator_params:
                in_channels: 1
                out_channels: 1
                kernel_sizes:
                - 5
                - 3
                channels: 32
                downsample_scales:
                - 3
                - 3
                - 3
                - 3
                - 1
                max_downsample_channels: 1024
                bias: true
                nonlinear_activation: LeakyReLU
                nonlinear_activation_params:
                    negative_slope: 0.1
                use_weight_norm: true
                use_spectral_norm: false
        generator_adv_loss_params:
            average_by_discriminators: false
            loss_type: mse
        discriminator_adv_loss_params:
            average_by_discriminators: false
            loss_type: mse
        feat_match_loss_params:
            average_by_discriminators: false
            average_by_layers: false
            include_final_outputs: true
        mel_loss_params:
            fs: 22050
            n_fft: 1024
            hop_length: 256
            win_length: null
            window: hann
            n_mels: 80
            fmin: 0
            fmax: null
            log_base: null
        lambda_adv: 1.0
        lambda_mel: 45.0
        lambda_feat_match: 2.0
        lambda_var: 1.0
        lambda_align: 2.0
        sampling_rate: 22050
        cache_generator_outputs: true
    pitch_extract: dio
    pitch_extract_conf:
        reduction_factor: 1
        use_token_averaged_f0: false
        fs: 22050
        n_fft: 1024
        hop_length: 256
        f0max: 400
        f0min: 80
    pitch_normalize: global_mvn
    pitch_normalize_conf:
        stats_file: exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/train/pitch_stats.npz
    energy_extract: energy
    energy_extract_conf:
        reduction_factor: 1
        use_token_averaged_energy: false
        fs: 22050
        n_fft: 1024
        hop_length: 256
        win_length: null
    energy_normalize: global_mvn
    energy_normalize_conf:
        stats_file: exp/tts_stats_raw_22.05khz_phn_jaconv_pyopenjtalk_prosody_jets/train/energy_stats.npz
    required:
    - output_dir
    - token_list
    version: '202205'
    distributed: true

Files

tts_train_jets_raw_phn_jaconv_pyopenjtalk_prosody_train.total_count.ave_5best.zip

Additional details

Related works

Is supplement to
https://github.com/espnet/espnet (URL)