Other Open Access
kan-bayashi
{ "files": [ { "links": { "self": "https://zenodo.org/api/files/b9892620-3772-4c93-95b4-c86b76d26b0c/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent_with_pause_train.loss.ave.zip" }, "checksum": "md5:d843f8183d6916507a0ddddc7cdfcc21", "bucket": "b9892620-3772-4c93-95b4-c86b76d26b0c", "key": "tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent_with_pause_train.loss.ave.zip", "type": "zip", "size": 133238771 } ], "owners": [ 116548 ], "doi": "10.5281/zenodo.4433196", "stats": { "version_unique_downloads": 130.0, "unique_views": 51.0, "views": 57.0, "version_views": 57.0, "unique_downloads": 130.0, "version_unique_views": 51.0, "volume": 23849740009.0, "version_downloads": 179.0, "downloads": 179.0, "version_volume": 23849740009.0 }, "links": { "doi": "https://doi.org/10.5281/zenodo.4433196", "conceptdoi": "https://doi.org/10.5281/zenodo.4433195", "bucket": "https://zenodo.org/api/files/b9892620-3772-4c93-95b4-c86b76d26b0c", "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.4433195.svg", "html": "https://zenodo.org/record/4433196", "latest_html": "https://zenodo.org/record/4433196", "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.4433196.svg", "latest": "https://zenodo.org/api/records/4433196" }, "conceptdoi": "10.5281/zenodo.4433195", "created": "2021-01-12T03:16:59.134961+00:00", "updated": "2022-02-07T09:42:38.521910+00:00", "conceptrecid": "4433195", "revision": 3, "id": 4433196, "metadata": { "access_right_category": "success", "doi": "10.5281/zenodo.4433196", "description": "<p>This model was trained by kan-bayashi using jsut/tts1 recipe in <a href=\"https://github.com/espnet/espnet/\">espnet</a>.</p>\n\n<p> </p>\n\n<ul>\n\t<li><strong>Python API</strong>\n\n\t<pre><code class=\"language-python\">See https://github.com/espnet/espnet_model_zoo</code></pre>\n\t</li>\n\t<li><strong>Evaluate in the recipe</strong>\n\t<pre><code class=\"language-bash\">git clone https://github.com/espnet/espnet\ncd espnet\ngit checkout 
18fb6edb7b14911730337baa05f0e40c4dde9002\npip install -e .\ncd egs2/jsut/tts1\n# Download the model file here\n./run.sh --skip_data_prep false --skip_train true --download_model kan-bayashi/jsut_tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent_with_pause_train.loss.ave</code>\n</pre>\n\t</li>\n\t<li><strong>Config</strong>\n\t<pre><code>config: conf/tuning/train_transformer.yaml\nprint_config: false\nlog_level: INFO\ndry_run: false\niterator_type: sequence\noutput_dir: exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent_with_pause\nngpu: 1\nseed: 0\nnum_workers: 1\nnum_att_plot: 3\ndist_backend: nccl\ndist_init_method: env://\ndist_world_size: 4\ndist_rank: 0\nlocal_rank: 0\ndist_master_addr: localhost\ndist_master_port: 58625\ndist_launcher: null\nmultiprocessing_distributed: true\ncudnn_enabled: true\ncudnn_benchmark: false\ncudnn_deterministic: true\ncollect_stats: false\nwrite_collected_feats: false\nmax_epoch: 200\npatience: null\nval_scheduler_criterion:\n- valid\n- loss\nearly_stopping_criterion:\n- valid\n- loss\n- min\nbest_model_criterion:\n- - valid\n - loss\n - min\n- - train\n - loss\n - min\nkeep_nbest_models: 5\ngrad_clip: 1.0\ngrad_clip_type: 2.0\ngrad_noise: false\naccum_grad: 2\nno_forward_run: false\nresume: true\ntrain_dtype: float32\nuse_amp: false\nlog_interval: null\nunused_parameters: false\nuse_tensorboard: true\nuse_wandb: false\nwandb_project: null\nwandb_id: null\npretrain_path: null\ninit_param: []\nfreeze_param: []\nnum_iters_per_epoch: 1000\nbatch_size: 20\nvalid_batch_size: null\nbatch_bins: 9000000\nvalid_batch_bins: null\ntrain_shape_file:\n- exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train/text_shape.phn\n- exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train/speech_shape\nvalid_shape_file:\n- exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent_with_pause/valid/text_shape.phn\n- exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent_with_pause/valid/speech_shape\nbatch_type: numel\nvalid_batch_type: 
null\nfold_length:\n- 150\n- 240000\nsort_in_batch: descending\nsort_batch: descending\nmultiple_iterator: false\nchunk_length: 500\nchunk_shift_ratio: 0.5\nnum_cache_chunks: 1024\ntrain_data_path_and_name_and_type:\n- - dump/raw/tr_no_dev/text\n - text\n - text\n- - dump/raw/tr_no_dev/wav.scp\n - speech\n - sound\nvalid_data_path_and_name_and_type:\n- - dump/raw/dev/text\n - text\n - text\n- - dump/raw/dev/wav.scp\n - speech\n - sound\nallow_variable_data_keys: false\nmax_cache_size: 0.0\nmax_cache_fd: 32\nvalid_max_cache_size: null\noptim: adam\noptim_conf:\n lr: 1.0\nscheduler: noamlr\nscheduler_conf:\n model_size: 512\n warmup_steps: 8000\ntoken_list:\n- \n- \n- '1'\n- '2'\n- '0'\n- '3'\n- '4'\n- '-1'\n- '5'\n- a\n- o\n- '-2'\n- i\n- '-3'\n- u\n- e\n- k\n- n\n- t\n- '6'\n- r\n- '-4'\n- s\n- N\n- m\n- pau\n- '7'\n- sh\n- d\n- g\n- w\n- '8'\n- U\n- '-5'\n- I\n- cl\n- h\n- y\n- b\n- '9'\n- j\n- ts\n- ch\n- '-6'\n- z\n- p\n- '-7'\n- f\n- ky\n- ry\n- '-8'\n- gy\n- '-9'\n- hy\n- ny\n- '-10'\n- by\n- my\n- '-11'\n- '-12'\n- '-13'\n- py\n- '-14'\n- '-15'\n- v\n- '10'\n- '-16'\n- '-17'\n- '11'\n- '-21'\n- '-20'\n- '12'\n- '-19'\n- '13'\n- '-18'\n- '14'\n- dy\n- '15'\n- ty\n- '-22'\n- '16'\n- '18'\n- '19'\n- '17'\n- \nodim: null\nmodel_conf: {}\nuse_preprocessor: true\ntoken_type: phn\nbpemodel: null\nnon_linguistic_symbols: null\ncleaner: jaconv\ng2p: pyopenjtalk_accent_with_pause\nfeats_extract: fbank\nfeats_extract_conf:\n fs: 24000\n fmin: 80\n fmax: 7600\n n_mels: 80\n hop_length: 300\n n_fft: 2048\n win_length: 1200\nnormalize: global_mvn\nnormalize_conf:\n stats_file: exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train/feats_stats.npz\ntts: transformer\ntts_conf:\n embed_dim: 0\n eprenet_conv_layers: 0\n eprenet_conv_filts: 0\n eprenet_conv_chans: 0\n dprenet_layers: 2\n dprenet_units: 256\n adim: 512\n aheads: 8\n elayers: 6\n eunits: 1024\n dlayers: 6\n dunits: 1024\n positionwise_layer_type: conv1d\n positionwise_conv_kernel_size: 1\n 
postnet_layers: 5\n postnet_filts: 5\n postnet_chans: 256\n use_masking: true\n bce_pos_weight: 5.0\n use_scaled_pos_enc: true\n encoder_normalize_before: true\n decoder_normalize_before: true\n reduction_factor: 1\n init_type: xavier_uniform\n init_enc_alpha: 1.0\n init_dec_alpha: 1.0\n eprenet_dropout_rate: 0.0\n dprenet_dropout_rate: 0.5\n postnet_dropout_rate: 0.5\n transformer_enc_dropout_rate: 0.1\n transformer_enc_positional_dropout_rate: 0.1\n transformer_enc_attn_dropout_rate: 0.1\n transformer_dec_dropout_rate: 0.1\n transformer_dec_positional_dropout_rate: 0.1\n transformer_dec_attn_dropout_rate: 0.1\n transformer_enc_dec_attn_dropout_rate: 0.1\n use_guided_attn_loss: true\n num_heads_applied_guided_attn: 2\n num_layers_applied_guided_attn: 2\n modules_applied_guided_attn:\n - encoder-decoder\n guided_attn_loss_sigma: 0.4\n guided_attn_loss_lambda: 10.0\npitch_extract: null\npitch_extract_conf: {}\npitch_normalize: null\npitch_normalize_conf: {}\nenergy_extract: null\nenergy_extract_conf: {}\nenergy_normalize: null\nenergy_normalize_conf: {}\nrequired:\n- output_dir\n- token_list\ndistributed: true</code></pre>\n\t</li>\n</ul>", "license": { "id": "CC-BY-NC-SA-4.0" }, "title": "ESPnet2 pretrained model, kan-bayashi/jsut_tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent_with_pause_train.loss.ave, fs=24000, lang=jp", "relations": { "version": [ { "count": 1, "index": 0, "parent": { "pid_type": "recid", "pid_value": "4433195" }, "is_last": true, "last_child": { "pid_type": "recid", "pid_value": "4433196" } } ] }, "communities": [ { "id": "espnet" } ], "keywords": [ "ESPnet", "deep-learning", "python", "pytorch", "speech-recognition", "speech-synthesis", "speech-translation", "machine-translation" ], "publication_date": "2021-01-12", "creators": [ { "name": "kan-bayashi" } ], "access_right": "open", "resource_type": { "type": "other", "title": "Other" }, "related_identifiers": [ { "scheme": "url", "identifier": "https://github.com/espnet/espnet", 
"relation": "isSupplementTo" }, { "scheme": "doi", "identifier": "10.5281/zenodo.4433195", "relation": "isVersionOf" } ] } }
| | All versions | This version |
|---|---|---|
| Views | 57 | 57 |
| Downloads | 179 | 179 |
| Data volume | 23.8 GB | 23.8 GB |
| Unique views | 51 | 51 |
| Unique downloads | 130 | 130 |