Conference paper Open Access

Continuous Wavelet Vocoder-Based Decomposition of Parametric Speech Waveform Synthesis

Al-Radhi, Mohammed Salah; Csapó, Tamás Gábor; Németh, Géza


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://zenodo.org/record/5730361">
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/5730361</dct:identifier>
    <foaf:page rdf:resource="https://zenodo.org/record/5730361"/>
    <dct:creator>
      <rdf:Description rdf:about="http://orcid.org/0000-0003-3094-6916">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0003-3094-6916</dct:identifier>
        <foaf:name>Al-Radhi, Mohammed Salah</foaf:name>
        <foaf:givenName>Mohammed Salah</foaf:givenName>
        <foaf:familyName>Al-Radhi</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Budapest University of Technology and Economics</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <rdf:Description rdf:about="http://orcid.org/0000-0003-4375-7524">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0003-4375-7524</dct:identifier>
        <foaf:name>Csapó, Tamás Gábor</foaf:name>
        <foaf:givenName>Tamás Gábor</foaf:givenName>
        <foaf:familyName>Csapó</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Budapest University of Technology and Economics</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <rdf:Description rdf:about="http://orcid.org/0000-0002-2311-4858">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-2311-4858</dct:identifier>
        <foaf:name>Németh, Géza</foaf:name>
        <foaf:givenName>Géza</foaf:givenName>
        <foaf:familyName>Németh</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Budapest University of Technology and Economics</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:title>Continuous Wavelet Vocoder-Based Decomposition of Parametric Speech Waveform Synthesis</dct:title>
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2021</dct:issued>
    <dcat:keyword>wavelet model</dcat:keyword>
    <dcat:keyword>speech synthesis</dcat:keyword>
    <dcat:keyword>statistical features</dcat:keyword>
    <dcat:keyword>continuous vocoder</dcat:keyword>
    <frapo:isFundedBy rdf:resource="info:eu-repo/grantAgreement/EC/H2020/825619/"/>
    <schema:funder>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/100010661</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </schema:funder>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2021-09-03</dct:issued>
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/ENG"/>
    <owl:sameAs rdf:resource="https://zenodo.org/record/5730361"/>
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/5730361</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <owl:sameAs rdf:resource="https://doi.org/10.21437/Interspeech.2021-1600"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/ai4eu"/>
    <owl:versionInfo>1</owl:versionInfo>
    <dct:description>&lt;p&gt;To date, various speech technology systems have adopted the vocoder approach, a method for synthesizing speech waveforms that plays a major role in the performance of statistical parametric speech synthesis. However, conventional source-filter systems (e.g., STRAIGHT) and sinusoidal models (e.g., MagPhase) tend to produce over-smoothed spectra, which often result in muffled and buzzy synthesized text-to-speech (TTS). WaveNet, one of the best models that nearly resembles the human voice, has to generate a waveform in a time-consuming sequential manner with an extremely complex structure of its neural networks. WaveNet needs large quantities of voice data before accurate predictions can be obtained. In order to motivate a new, alternative approach to these issues, we present an updated synthesizer, which is a simple signal model to train and easy to generate waveforms, using Continuous Wavelet Transform (CWT) to characterize and decompose speech features. CWT provides time and frequency resolutions different from those of the short-time Fourier transform. It can also retain the fine spectral envelope and achieve high controllability of the structure closer to human auditory scales. We confirmed through experiments that our speech synthesis system was able to provide natural-sounding synthetic speech and outperformed the state-of-the-art WaveNet vocoder.&lt;/p&gt;</dct:description>
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.21437/Interspeech.2021-1600"/>
        <dcat:byteSize>474917</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/5730361/files/alradhi21_interspeech.pdf"/>
        <dcat:mediaType>application/pdf</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
  <foaf:Project rdf:about="info:eu-repo/grantAgreement/EC/H2020/825619/">
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">825619</dct:identifier>
    <dct:title>A European AI On Demand Platform and Ecosystem</dct:title>
    <frapo:isAwardedBy>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/100010661</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </frapo:isAwardedBy>
  </foaf:Project>
</rdf:RDF>
21 views
19 downloads
Views 21
Downloads 19
Data volume 9.0 MB
Unique views 17
Unique downloads 19

Share

Cite as