{ "access": { "embargo": { "active": false, "reason": null }, "files": "public", "record": "public", "status": "open" }, "created": "2020-08-15T02:41:21.958947+00:00", "custom_fields": {}, "deletion_status": { "is_deleted": false, "status": "P" }, "files": { "count": 1, "enabled": true, "entries": { "tts_train_fastspeech_raw_phn_pypinyin_g2p_phone_train.loss.best.zip": { "checksum": "md5:7b15aaa6890a7dfeba4edc8ac0e9c963", "ext": "zip", "id": "ecd1e478-e725-449e-a974-12696fa5e854", "key": "tts_train_fastspeech_raw_phn_pypinyin_g2p_phone_train.loss.best.zip", "metadata": null, "mimetype": "application/zip", "size": 207245491 } }, "order": [], "total_bytes": 207245491 }, "id": "3986227", "is_draft": false, "is_published": true, "links": { "access": "https://zenodo.org/api/records/3986227/access", "access_links": "https://zenodo.org/api/records/3986227/access/links", "access_request": "https://zenodo.org/api/records/3986227/access/request", "access_users": "https://zenodo.org/api/records/3986227/access/users", "archive": "https://zenodo.org/api/records/3986227/files-archive", "archive_media": "https://zenodo.org/api/records/3986227/media-files-archive", "communities": "https://zenodo.org/api/records/3986227/communities", "communities-suggestions": "https://zenodo.org/api/records/3986227/communities-suggestions", "doi": "https://doi.org/10.5281/zenodo.3986227", "draft": "https://zenodo.org/api/records/3986227/draft", "files": "https://zenodo.org/api/records/3986227/files", "latest": "https://zenodo.org/api/records/3986227/versions/latest", "latest_html": "https://zenodo.org/records/3986227/latest", "media_files": "https://zenodo.org/api/records/3986227/media-files", "parent": "https://zenodo.org/api/records/3986226", "parent_doi": "https://zenodo.org/doi/10.5281/zenodo.3986226", "parent_html": "https://zenodo.org/records/3986226", "requests": "https://zenodo.org/api/records/3986227/requests", "reserve_doi": "https://zenodo.org/api/records/3986227/draft/pids/doi", "self": "https://zenodo.org/api/records/3986227", "self_doi": "https://zenodo.org/doi/10.5281/zenodo.3986227", "self_html": "https://zenodo.org/records/3986227", "self_iiif_manifest": "https://zenodo.org/api/iiif/record:3986227/manifest", "self_iiif_sequence": "https://zenodo.org/api/iiif/record:3986227/sequence/default", "versions": "https://zenodo.org/api/records/3986227/versions" }, "media_files": { "count": 0, "enabled": false, "entries": {}, "order": [], "total_bytes": 0 }, "metadata": { "creators": [ { "person_or_org": { "family_name": "kan-bayashi", "name": "kan-bayashi", "type": "personal" } } ], "description": "
This model was trained by kan-bayashi using csmsc/tts1 recipe in espnet.
\n\n\n\n
See https://github.com/espnet/espnet_model_zoo
\n\tgit clone https://github.com/espnet/espnet\ncd espnet\ngit checkout b4413f6259c49d2543db1e10417c08118a09d990\npip install -e .\ncd egs2/csmsc/tts1\n# Download the model file here\n./run.sh --skip_data_prep false --skip_train true --download_model kan-bayashi/csmsc_tts_train_fastspeech_raw_phn_pypinyin_g2p_phone_train.loss.best
\n
\n\tconfig: conf/tuning/train_fastspeech.yaml\nprint_config: false\nlog_level: INFO\ndry_run: false\niterator_type: sequence\noutput_dir: exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone\nngpu: 1\nseed: 0\nnum_workers: 1\nnum_att_plot: 3\ndist_backend: nccl\ndist_init_method: env://\ndist_world_size: null\ndist_rank: null\nlocal_rank: 0\ndist_master_addr: null\ndist_master_port: null\ndist_launcher: null\nmultiprocessing_distributed: false\ncudnn_enabled: true\ncudnn_benchmark: false\ncudnn_deterministic: true\ncollect_stats: false\nwrite_collected_feats: false\nmax_epoch: 1000\npatience: null\nval_scheduler_criterion:\n- valid\n- loss\nearly_stopping_criterion:\n- valid\n- loss\n- min\nbest_model_criterion:\n- - valid\n - loss\n - min\n- - train\n - loss\n - min\nkeep_nbest_models: 5\ngrad_clip: 1.0\ngrad_noise: false\naccum_grad: 6\nno_forward_run: false\nresume: true\ntrain_dtype: float32\nlog_interval: null\npretrain_path: []\npretrain_key: []\nnum_iters_per_epoch: null\nbatch_size: 20\nvalid_batch_size: null\nbatch_bins: 800000\nvalid_batch_bins: null\ntrain_shape_file:\n- exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/text_shape.phn\n- exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/tr_no_dev/speech_shape\nvalid_shape_file:\n- exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/text_shape.phn\n- exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/dev/speech_shape\nbatch_type: numel\nvalid_batch_type: null\nfold_length:\n- 150\n- 800\nsort_in_batch: descending\nsort_batch: descending\nmultiple_iterator: false\nchunk_length: 500\nchunk_shift_ratio: 0.5\nnum_cache_chunks: 1024\ntrain_data_path_and_name_and_type:\n- - dump/raw/tr_no_dev/text\n - text\n - text\n- - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/tr_no_dev/durations\n - durations\n - text_int\n- - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/tr_no_dev/denorm/feats.scp\n - speech\n - npy\n- - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/collect_feats/pitch.scp\n - pitch\n - npy\n- - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/collect_feats/energy.scp\n - energy\n - npy\nvalid_data_path_and_name_and_type:\n- - dump/raw/dev/text\n - text\n - text\n- - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/dev/durations\n - durations\n - text_int\n- - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/dev/denorm/feats.scp\n - speech\n - npy\n- - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/collect_feats/pitch.scp\n - pitch\n - npy\n- - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/collect_feats/energy.scp\n - energy\n - npy\nallow_variable_data_keys: false\nmax_cache_size: 0.0\nvalid_max_cache_size: null\noptim: adam\noptim_conf:\n lr: 1.0\nscheduler: noamlr\nscheduler_conf:\n model_size: 384\n warmup_steps: 4000\ntoken_list:\n- \n- \n- \"\\uFF30\"\n- \"\\uFF22\"\n- \"\\xFC\"\n- an\n- ueng3\n- '2'\n- uen\n- ei\n- ua\n- ao\n- u\n- ueng4\n- uo\n- ang\n- ou\n- v2\n- ueng1\n- o\n- io1\n- \"\\xFCn3\"\n- er\n- ve4\n- o3\n- uai2\n- uen3\n- uen1\n- uai3\n- \"\\xFCe3\"\n- iou1\n- iong2\n- ia2\n- uai1\n- iong1\n- \"\\xFCan1\"\n- \"\\xFCe1\"\n- v4\n- ua3\n- ia\n- iong3\n- uei3\n- ua2\n- ia3\n- uei1\n- o1\n- o4\n- \"\\xFCn2\"\n- un2\n- er3\n- \"\\xFCn1\"\n- uen4\n- un3\n- iu1\n- \"\\xFCn4\"\n- uen2\n- \"\\xFCan3\"\n- un4\n- \"\\xFCan4\"\n- iu3\n- ua1\n- uei2\n- \"\\uFF01\"\n- iou4\n- iou2\n- er4\n- o2\n- ei1\n- iao2\n- uang4\n- \"\\xFC1\"\n- ui2\n- v3\n- uang2\n- iong4\n- un1\n- ui1\n- ua4\n- ao2\n- en\n- a\n- iu2\n- uang1\n- uang3\n- \"\\xFCe2\"\n- in3\n- \"\\uFF1F\"\n- uai4\n- \"\\xFCe4\"\n- uan2\n- ou2\n- eng3\n- ui3\n- uan4\n- a2\n- ie2\n- ong3\n- iang2\n- ie1\n- in4\n- iao1\n- e1\n- in2\n- en4\n- uan3\n- \"\\xFC2\"\n- ing3\n- i\n- ei2\n- ei3\n- iang1\n- er2\n- ia4\n- uo2\n- \"\\xFC3\"\n- uan1\n- ia1\n- e3\n- ong4\n- ie4\n- ai1\n- en3\n- iang3\n- eng4\n- iang4\n- ao1\n- ou1\n- ang2\n- ai3\n- iu4\n- \"\\xFCan2\"\n- ang3\n- en1\n- ong2\n- uei4\n- ei4\n- iao3\n- \"\\xFC4\"\n- an2\n- ing4\n- an3\n- a3\n- ie3\n- an1\n- ian3\n- uo1\n- ing1\n- ou4\n- ian1\n- ou3\n- eng1\n- ang1\n- in1\n- a4\n- eng2\n- uo4\n- u1\n- ang4\n- iou3\n- iao4\n- ian2\n- u2\n- ui4\n- e2\n- en2\n- u3\n- ing2\n- ao4\n- ong1\n- an4\n- ai2\n- ao3\n- uo3\n- ian4\n- p\n- c\n- a1\n- ai4\n- e4\n- s\n- k\n- r\n- i2\n- f\n- n\n- u4\n- ch\n- i3\n- i1\n- q\n- z\n- m\n- t\n- g\n- b\n- e\n- h\n- i4\n- x\n- \"\\uFF0C\"\n- zh\n- \"\\u3002\"\n- l\n- j\n- sh\n- d\n- \nodim: 80\nmodel_conf: {}\nuse_preprocessor: true\ntoken_type: phn\nbpemodel: null\nnon_linguistic_symbols: null\ncleaner: null\ng2p: pypinyin_g2p_phone\nfeats_extract: null\nfeats_extract_conf: null\nnormalize: global_mvn\nnormalize_conf:\n stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/feats_stats.npz\ntts: fastspeech\ntts_conf:\n adim: 384\n aheads: 2\n elayers: 6\n eunits: 1536\n dlayers: 6\n dunits: 1536\n positionwise_layer_type: conv1d\n positionwise_conv_kernel_size: 3\n duration_predictor_layers: 2\n duration_predictor_chans: 384\n duration_predictor_kernel_size: 3\n postnet_layers: 5\n postnet_filts: 5\n postnet_chans: 256\n use_masking: true\n use_scaled_pos_enc: true\n encoder_normalize_before: false\n decoder_normalize_before: false\n reduction_factor: 1\n init_type: xavier_uniform\n init_enc_alpha: 1.0\n init_dec_alpha: 1.0\n transformer_enc_dropout_rate: 0.1\n transformer_enc_positional_dropout_rate: 0.1\n transformer_enc_attn_dropout_rate: 0.1\n transformer_dec_dropout_rate: 0.1\n transformer_dec_positional_dropout_rate: 0.1\n transformer_dec_attn_dropout_rate: 0.1\npitch_extract: null\npitch_extract_conf:\n fs: 24000\n n_fft: 2048\n hop_length: 300\n f0max: 400\n f0min: 80\npitch_normalize: null\npitch_normalize_conf:\n stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/pitch_stats.npz\nenergy_extract: null\nenergy_extract_conf:\n fs: 24000\n n_fft: 2048\n hop_length: 300\n win_length: 1200\nenergy_normalize: null\nenergy_normalize_conf:\n stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/energy_stats.npz\nrequired:\n- output_dir\n- token_list\ndistributed: false
\n\t