{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2020-09-19T01:23:15.214000+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 1, "enabled": true, "entries": { "tts_train_conformer_fastspeech2_raw_phn_tacotron_g2p_en_no_space_train.loss.ave.zip": { "checksum": "md5:f3ea800cb20449ba6cfef630a8da2994", "ext": "zip", "id": "ff848065-489e-4489-9a7c-e922757a4cfd", "key": "tts_train_conformer_fastspeech2_raw_phn_tacotron_g2p_en_no_space_train.loss.ave.zip", "metadata": null, "mimetype": "application/zip", "size": 281785671 } }, "order": [], "total_bytes": 281785671 }, "id": "4036268", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/4036268/access", "access_links": "https://zenodo.org/api/records/4036268/access/links", "access_request": "https://zenodo.org/api/records/4036268/access/request", "access_users": "https://zenodo.org/api/records/4036268/access/users", "archive": "https://zenodo.org/api/records/4036268/files-archive", "archive_media": "https://zenodo.org/api/records/4036268/media-files-archive", "communities": "https://zenodo.org/api/records/4036268/communities", "communities-suggestions": "https://zenodo.org/api/records/4036268/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.4036268", "draft": "https://zenodo.org/api/records/4036268/draft", "files": "https://zenodo.org/api/records/4036268/files", "latest": "https://zenodo.org/api/records/4036268/versions/latest", "latest_html": "https://zenodo.org/records/4036268/latest", "media_files": "https://zenodo.org/api/records/4036268/media-files", "parent": "https://zenodo.org/api/records/4036267", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.4036267", "parent_html": "https://zenodo.org/records/4036267", "requests": "https://zenodo.org/api/records/4036268/requests", "reserve_doi": 
"https://zenodo.org/api/records/4036268/draft/pids/doi", "self": "https://zenodo.org/api/records/4036268", "self_doi": "https://zenodo.org/doi/10.5281/zenodo.4036268", "self_html": "https://zenodo.org/records/4036268", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:4036268/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:4036268/sequence/default", "versions": "https://zenodo.org/api/records/4036268/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "creators": [ { "person_or_org": { "family_name": "kan-bayashi", "name": "kan-bayashi", "type": "personal" } } ], "description": "This model was trained by kan-bayashi using ljspeech/tts1 recipe in espnet.\n
\n
See https://github.com/espnet/espnet_model_zoo\n
git clone https://github.com/espnet/espnet\ncd espnet\ngit checkout 322a5cf47d7216517e02addc0a67a510b729b056\npip install -e .\ncd egs2/ljspeech/tts1\n# Download the model file here\n./run.sh --skip_data_prep false --skip_train true --download_model kan-bayashi/ljspeech_tts_train_conformer_fastspeech2_raw_phn_tacotron_g2p_en_no_space_train.loss.ave
\n
config: conf/tuning/train_conformer_fastspeech2.yaml\nprint_config: false\nlog_level: INFO\ndry_run: false\niterator_type: sequence\noutput_dir: exp/tts_train_conformer_fastspeech2_raw_phn_tacotron_g2p_en_no_space\nngpu: 1\nseed: 0\nnum_workers: 1\nnum_att_plot: 3\ndist_backend: nccl\ndist_init_method: env://\ndist_world_size: null\ndist_rank: null\nlocal_rank: 0\ndist_master_addr: null\ndist_master_port: null\ndist_launcher: null\nmultiprocessing_distributed: false\ncudnn_enabled: true\ncudnn_benchmark: false\ncudnn_deterministic: true\ncollect_stats: false\nwrite_collected_feats: false\nmax_epoch: 1000\npatience: null\nval_scheduler_criterion:\n- valid\n- loss\nearly_stopping_criterion:\n- valid\n- loss\n- min\nbest_model_criterion:\n- - valid\n - loss\n - min\n- - train\n - loss\n - min\nkeep_nbest_models: 5\ngrad_clip: 1.0\ngrad_clip_type: 2.0\ngrad_noise: false\naccum_grad: 10\nno_forward_run: false\nresume: true\ntrain_dtype: float32\nuse_amp: false\nlog_interval: null\npretrain_path: []\npretrain_key: []\nnum_iters_per_epoch: 500\nbatch_size: 20\nvalid_batch_size: null\nbatch_bins: 2400000\nvalid_batch_bins: null\ntrain_shape_file:\n- exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/text_shape.phn\n- exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/speech_shape\nvalid_shape_file:\n- exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/valid/text_shape.phn\n- exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/valid/speech_shape\nbatch_type: numel\nvalid_batch_type: null\nfold_length:\n- 150\n- 204800\nsort_in_batch: descending\nsort_batch: descending\nmultiple_iterator: false\nchunk_length: 500\nchunk_shift_ratio: 0.5\nnum_cache_chunks: 1024\ntrain_data_path_and_name_and_type:\n- - 
dump/raw/tr_no_dev/text\n - text\n - text\n- - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/tr_no_dev/durations\n - durations\n - text_int\n- - dump/raw/tr_no_dev/wav.scp\n - speech\n - sound\n- - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/collect_feats/pitch.scp\n - pitch\n - npy\n- - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/collect_feats/energy.scp\n - energy\n - npy\nvalid_data_path_and_name_and_type:\n- - dump/raw/dev/text\n - text\n - text\n- - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/dev/durations\n - durations\n - text_int\n- - dump/raw/dev/wav.scp\n - speech\n - sound\n- - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/valid/collect_feats/pitch.scp\n - pitch\n - npy\n- - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/valid/collect_feats/energy.scp\n - energy\n - npy\nallow_variable_data_keys: false\nmax_cache_size: 0.0\nvalid_max_cache_size: null\noptim: adam\noptim_conf:\n lr: 1.0\nscheduler: noamlr\nscheduler_conf:\n model_size: 384\n warmup_steps: 4000\ntoken_list:\n- <blank>\n- <unk>\n- ..\n- OY0\n- UH0\n- AW0\n- '!'\n- OY2\n- '?'\n- UH2\n- ER2\n- ''''\n- AA0\n- IY2\n- AW2\n- AY0\n- AH2\n- UW2\n- AE0\n- OW2\n- ZH\n- AO2\n- EY0\n- OY1\n- EH0\n- UW0\n- AA2\n- AY2\n- AE2\n- IH2\n- AO0\n- EY2\n- OW0\n- EH2\n- UH1\n- TH\n- AW1\n- Y\n- JH\n- CH\n- ER1\n- G\n- NG\n- SH\n- OW1\n- .\n- AY1\n- EY1\n- AO1\n- IY0\n- UW1\n- IY1\n- HH\n- B\n- AA1\n- ','\n- F\n- ER0\n- V\n- AH1\n- AE1\n- P\n- W\n- EH1\n- M\n- IH0\n- IH1\n- Z\n- K\n- DH\n- L\n- R\n- S\n- D\n- T\n- N\n- AH0\n- <sos/eos>\nodim: null\nmodel_conf: {}\nuse_preprocessor: true\ntoken_type: phn\nbpemodel: 
null\nnon_linguistic_symbols: null\ncleaner: tacotron\ng2p: g2p_en_no_space\nfeats_extract: fbank\nfeats_extract_conf:\n fs: 22050\n fmin: 80\n fmax: 7600\n n_mels: 80\n hop_length: 256\n n_fft: 1024\n win_length: null\nnormalize: global_mvn\nnormalize_conf:\n stats_file: exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/feats_stats.npz\ntts: fastspeech2\ntts_conf:\n adim: 384\n aheads: 2\n elayers: 4\n eunits: 1536\n dlayers: 4\n dunits: 1536\n positionwise_layer_type: conv1d\n positionwise_conv_kernel_size: 3\n duration_predictor_layers: 2\n duration_predictor_chans: 256\n duration_predictor_kernel_size: 3\n postnet_layers: 5\n postnet_filts: 5\n postnet_chans: 256\n use_masking: true\n encoder_normalize_before: false\n decoder_normalize_before: false\n reduction_factor: 1\n encoder_type: conformer\n decoder_type: conformer\n conformer_pos_enc_layer_type: rel_pos\n conformer_self_attn_layer_type: rel_selfattn\n conformer_activation_type: swish\n use_macaron_style_in_conformer: true\n use_cnn_in_conformer: true\n conformer_enc_kernel_size: 7\n conformer_dec_kernel_size: 31\n init_type: xavier_uniform\n transformer_enc_dropout_rate: 0.2\n transformer_enc_positional_dropout_rate: 0.2\n transformer_enc_attn_dropout_rate: 0.2\n transformer_dec_dropout_rate: 0.2\n transformer_dec_positional_dropout_rate: 0.2\n transformer_dec_attn_dropout_rate: 0.2\n pitch_predictor_layers: 5\n pitch_predictor_chans: 256\n pitch_predictor_kernel_size: 5\n pitch_predictor_dropout: 0.5\n pitch_embed_kernel_size: 1\n pitch_embed_dropout: 0.0\n stop_gradient_from_pitch_predictor: true\n energy_predictor_layers: 2\n energy_predictor_chans: 256\n energy_predictor_kernel_size: 3\n energy_predictor_dropout: 0.5\n energy_embed_kernel_size: 1\n energy_embed_dropout: 0.0\n stop_gradient_from_energy_predictor: false\npitch_extract: dio\npitch_extract_conf:\n fs: 22050\n n_fft: 1024\n hop_length: 256\n f0max: 400\n f0min: 
80\npitch_normalize: global_mvn\npitch_normalize_conf:\n stats_file: exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/pitch_stats.npz\nenergy_extract: energy\nenergy_extract_conf:\n fs: 22050\n n_fft: 1024\n hop_length: 256\n win_length: null\nenergy_normalize: global_mvn\nenergy_normalize_conf:\n stats_file: exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/energy_stats.npz\nrequired:\n- output_dir\n- token_list\ndistributed: false