Published August 15, 2020 | Version v1
Other Open

ESPnet2 pretrained model, kan-bayashi/csmsc_tts_train_fastspeech_raw_phn_pypinyin_g2p_phone_train.loss.best, fs=24000, lang=zh

Creators

Description

This model was trained by kan-bayashi using the csmsc/tts1 recipe in ESPnet.

 

  • Python API
    See https://github.com/espnet/espnet_model_zoo
  • Evaluate in the recipe
    git clone https://github.com/espnet/espnet
    cd espnet
    git checkout b4413f6259c49d2543db1e10417c08118a09d990
    pip install -e .
    cd egs2/csmsc/tts1
    # Download the model file here
    ./run.sh --skip_data_prep false --skip_train true --download_model kan-bayashi/csmsc_tts_train_fastspeech_raw_phn_pypinyin_g2p_phone_train.loss.best
    
  • Config
    config: conf/tuning/train_fastspeech.yaml
    print_config: false
    log_level: INFO
    dry_run: false
    iterator_type: sequence
    output_dir: exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone
    ngpu: 1
    seed: 0
    num_workers: 1
    num_att_plot: 3
    dist_backend: nccl
    dist_init_method: env://
    dist_world_size: null
    dist_rank: null
    local_rank: 0
    dist_master_addr: null
    dist_master_port: null
    dist_launcher: null
    multiprocessing_distributed: false
    cudnn_enabled: true
    cudnn_benchmark: false
    cudnn_deterministic: true
    collect_stats: false
    write_collected_feats: false
    max_epoch: 1000
    patience: null
    val_scheduler_criterion:
    - valid
    - loss
    early_stopping_criterion:
    - valid
    - loss
    - min
    best_model_criterion:
    -   - valid
        - loss
        - min
    -   - train
        - loss
        - min
    keep_nbest_models: 5
    grad_clip: 1.0
    grad_noise: false
    accum_grad: 6
    no_forward_run: false
    resume: true
    train_dtype: float32
    log_interval: null
    pretrain_path: []
    pretrain_key: []
    num_iters_per_epoch: null
    batch_size: 20
    valid_batch_size: null
    batch_bins: 800000
    valid_batch_bins: null
    train_shape_file:
    - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/text_shape.phn
    - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/tr_no_dev/speech_shape
    valid_shape_file:
    - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/text_shape.phn
    - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/dev/speech_shape
    batch_type: numel
    valid_batch_type: null
    fold_length:
    - 150
    - 800
    sort_in_batch: descending
    sort_batch: descending
    multiple_iterator: false
    chunk_length: 500
    chunk_shift_ratio: 0.5
    num_cache_chunks: 1024
    train_data_path_and_name_and_type:
    -   - dump/raw/tr_no_dev/text
        - text
        - text
    -   - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/tr_no_dev/durations
        - durations
        - text_int
    -   - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/tr_no_dev/denorm/feats.scp
        - speech
        - npy
    -   - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/collect_feats/pitch.scp
        - pitch
        - npy
    -   - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/collect_feats/energy.scp
        - energy
        - npy
    valid_data_path_and_name_and_type:
    -   - dump/raw/dev/text
        - text
        - text
    -   - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/dev/durations
        - durations
        - text_int
    -   - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/dev/denorm/feats.scp
        - speech
        - npy
    -   - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/collect_feats/pitch.scp
        - pitch
        - npy
    -   - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/collect_feats/energy.scp
        - energy
        - npy
    allow_variable_data_keys: false
    max_cache_size: 0.0
    valid_max_cache_size: null
    optim: adam
    optim_conf:
        lr: 1.0
    scheduler: noamlr
    scheduler_conf:
        model_size: 384
        warmup_steps: 4000
    token_list:
    - <blank>
    - <unk>
    - "\uFF30"
    - "\uFF22"
    - "\xFC"
    - an
    - ueng3
    - '2'
    - uen
    - ei
    - ua
    - ao
    - u
    - ueng4
    - uo
    - ang
    - ou
    - v2
    - ueng1
    - o
    - io1
    - "\xFCn3"
    - er
    - ve4
    - o3
    - uai2
    - uen3
    - uen1
    - uai3
    - "\xFCe3"
    - iou1
    - iong2
    - ia2
    - uai1
    - iong1
    - "\xFCan1"
    - "\xFCe1"
    - v4
    - ua3
    - ia
    - iong3
    - uei3
    - ua2
    - ia3
    - uei1
    - o1
    - o4
    - "\xFCn2"
    - un2
    - er3
    - "\xFCn1"
    - uen4
    - un3
    - iu1
    - "\xFCn4"
    - uen2
    - "\xFCan3"
    - un4
    - "\xFCan4"
    - iu3
    - ua1
    - uei2
    - "\uFF01"
    - iou4
    - iou2
    - er4
    - o2
    - ei1
    - iao2
    - uang4
    - "\xFC1"
    - ui2
    - v3
    - uang2
    - iong4
    - un1
    - ui1
    - ua4
    - ao2
    - en
    - a
    - iu2
    - uang1
    - uang3
    - "\xFCe2"
    - in3
    - "\uFF1F"
    - uai4
    - "\xFCe4"
    - uan2
    - ou2
    - eng3
    - ui3
    - uan4
    - a2
    - ie2
    - ong3
    - iang2
    - ie1
    - in4
    - iao1
    - e1
    - in2
    - en4
    - uan3
    - "\xFC2"
    - ing3
    - i
    - ei2
    - ei3
    - iang1
    - er2
    - ia4
    - uo2
    - "\xFC3"
    - uan1
    - ia1
    - e3
    - ong4
    - ie4
    - ai1
    - en3
    - iang3
    - eng4
    - iang4
    - ao1
    - ou1
    - ang2
    - ai3
    - iu4
    - "\xFCan2"
    - ang3
    - en1
    - ong2
    - uei4
    - ei4
    - iao3
    - "\xFC4"
    - an2
    - ing4
    - an3
    - a3
    - ie3
    - an1
    - ian3
    - uo1
    - ing1
    - ou4
    - ian1
    - ou3
    - eng1
    - ang1
    - in1
    - a4
    - eng2
    - uo4
    - u1
    - ang4
    - iou3
    - iao4
    - ian2
    - u2
    - ui4
    - e2
    - en2
    - u3
    - ing2
    - ao4
    - ong1
    - an4
    - ai2
    - ao3
    - uo3
    - ian4
    - p
    - c
    - a1
    - ai4
    - e4
    - s
    - k
    - r
    - i2
    - f
    - n
    - u4
    - ch
    - i3
    - i1
    - q
    - z
    - m
    - t
    - g
    - b
    - e
    - h
    - i4
    - x
    - "\uFF0C"
    - zh
    - "\u3002"
    - l
    - j
    - sh
    - d
    - <sos/eos>
    odim: 80
    model_conf: {}
    use_preprocessor: true
    token_type: phn
    bpemodel: null
    non_linguistic_symbols: null
    cleaner: null
    g2p: pypinyin_g2p_phone
    feats_extract: null
    feats_extract_conf: null
    normalize: global_mvn
    normalize_conf:
        stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/feats_stats.npz
    tts: fastspeech
    tts_conf:
        adim: 384
        aheads: 2
        elayers: 6
        eunits: 1536
        dlayers: 6
        dunits: 1536
        positionwise_layer_type: conv1d
        positionwise_conv_kernel_size: 3
        duration_predictor_layers: 2
        duration_predictor_chans: 384
        duration_predictor_kernel_size: 3
        postnet_layers: 5
        postnet_filts: 5
        postnet_chans: 256
        use_masking: true
        use_scaled_pos_enc: true
        encoder_normalize_before: false
        decoder_normalize_before: false
        reduction_factor: 1
        init_type: xavier_uniform
        init_enc_alpha: 1.0
        init_dec_alpha: 1.0
        transformer_enc_dropout_rate: 0.1
        transformer_enc_positional_dropout_rate: 0.1
        transformer_enc_attn_dropout_rate: 0.1
        transformer_dec_dropout_rate: 0.1
        transformer_dec_positional_dropout_rate: 0.1
        transformer_dec_attn_dropout_rate: 0.1
    pitch_extract: null
    pitch_extract_conf:
        fs: 24000
        n_fft: 2048
        hop_length: 300
        f0max: 400
        f0min: 80
    pitch_normalize: null
    pitch_normalize_conf:
        stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/pitch_stats.npz
    energy_extract: null
    energy_extract_conf:
        fs: 24000
        n_fft: 2048
        hop_length: 300
        win_length: 1200
    energy_normalize: null
    energy_normalize_conf:
        stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/energy_stats.npz
    required:
    - output_dir
    - token_list
    distributed: false

Files

tts_train_fastspeech_raw_phn_pypinyin_g2p_phone_train.loss.best.zip

Files (207.2 MB)

Additional details

Related works

Is supplement to
https://github.com/espnet/espnet (URL)