Journal article Open Access

Adaptive Refinements of Pitch Tracking and HNR Estimation within a Vocoder for Statistical Parametric Speech Synthesis

Al-Radhi, Mohammed Salah; Csapó, Tamás Gábor; Németh, Géza


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <!-- Metadata for the Zenodo record https://zenodo.org/record/5729320:
       creators, title, publisher, issue dates, keywords, funding, language,
       identifiers, abstract, access rights, license and one PDF distribution. -->
  <rdf:Description rdf:about="https://zenodo.org/record/5729320">
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/5729320</dct:identifier>
    <foaf:page rdf:resource="https://zenodo.org/record/5729320"/>
    <!-- Creators: one dct:creator per author, in the same order as the author
         list. Each agent is identified by its ORCID URI (rdf:about), repeats the
         bare ORCID iD as a string in dct:identifier, and gives the name both in
         "family, given" form (foaf:name) and as separate parts. The affiliation
         is an unnamed (blank-node) foaf:Organization carrying only a name. -->
    <dct:creator>
      <rdf:Description rdf:about="http://orcid.org/0000-0003-3094-6916">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0003-3094-6916</dct:identifier>
        <foaf:name>Al-Radhi, Mohammed Salah</foaf:name>
        <foaf:givenName>Mohammed Salah</foaf:givenName>
        <foaf:familyName>Al-Radhi</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Budapest University of Technology and Economics</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <rdf:Description rdf:about="http://orcid.org/0000-0003-4375-7524">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0003-4375-7524</dct:identifier>
        <foaf:name>Csapó, Tamás Gábor</foaf:name>
        <foaf:givenName>Tamás Gábor</foaf:givenName>
        <foaf:familyName>Csapó</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Budapest University of Technology and Economics</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <rdf:Description rdf:about="http://orcid.org/0000-0002-2311-4858">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-2311-4858</dct:identifier>
        <foaf:name>Németh, Géza</foaf:name>
        <foaf:givenName>Géza</foaf:givenName>
        <foaf:familyName>Németh</foaf:familyName>
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>Budapest University of Technology and Economics</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:title>Adaptive Refinements of Pitch Tracking and HNR Estimation within a Vocoder for Statistical Parametric Speech Synthesis</dct:title>
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <!-- dct:issued occurs twice on this record: here as a year (xsd:gYear) and
         further down as a full calendar date (xsd:date). Consumers expecting a
         single value must choose one. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2019</dct:issued>
    <dcat:keyword>continuous F0</dcat:keyword>
    <dcat:keyword>speech synthesis</dcat:keyword>
    <dcat:keyword>time-warping</dcat:keyword>
    <!-- Funding: frapo:isFundedBy points at the grant resource that is described
         by the foaf:Project element with the same rdf:about in this document.
         schema:funder names the funding organisation inline; its identifier is
         presumably a Crossref Funder Registry DOI (TODO confirm). -->
    <frapo:isFundedBy rdf:resource="info:eu-repo/grantAgreement/EC/H2020/825619/"/>
    <schema:funder>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/100010661</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </schema:funder>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-06-16</dct:issued>
    <!-- Language is given as an authority URI (code ENG), not as a literal. -->
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/ENG"/>
    <!-- owl:sameAs is asserted twice: once for the record's own URL (here) and
         once for the article DOI URL (below, next to dct:relation). -->
    <owl:sameAs rdf:resource="https://zenodo.org/record/5729320"/>
    <!-- ADMS identifier: the record URL as an xsd:anyURI notation, with the
         literal scheme agency "url". -->
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/5729320</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <dct:relation rdf:resource="https://doi.org/10.3390/app9122460"/>
    <owl:sameAs rdf:resource="https://doi.org/10.3390/app9122460"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/ai4eu"/>
    <owl:versionInfo>1</owl:versionInfo>
    <!-- Abstract. The value is an HTML fragment stored as escaped character
         data: after XML parsing the literal still contains paragraph tags and a
         non-breaking-space entity reference, so consumers that want plain text
         must strip or decode the HTML themselves. -->
    <dct:description>&lt;p&gt;Recent studies in text-to-speech synthesis have shown the benefit of using a continuous pitch estimate; one that interpolates fundamental frequency (F0) even when voicing is not present. However, continuous F0 is still sensitive to additive noise in speech signals and suffers from short-term errors (when it changes rather quickly over time). To alleviate these issues, three adaptive techniques have been developed in this article for achieving a robust and accurate F0: (1) we weight the pitch estimates with state noise covariance using adaptive Kalman-filter framework, (2) we iteratively apply a time axis warping on the input frame signal, (3) we optimize all F0 candidates using an instantaneous-frequency-based approach. Additionally, the second goal of this study is to introduce an extension of a novel continuous-based speech synthesis system (i.e., in which all parameters are continuous). We propose adding a new excitation parameter named Harmonic-to-Noise Ratio (HNR) to the voiced and unvoiced components to indicate the degree of voicing in the excitation and to reduce the influence of buzziness caused by the vocoder. Results based on objective and perceptual tests demonstrate that the voice built with the proposed framework gives state-of-the-art speech synthesis performance while outperforming the previous baseline.&amp;nbsp;&lt;/p&gt;</dct:description>
    <!-- NOTE(review): this second dct:description holds only "1", the same value
         as owl:versionInfo above. It may be an export artifact rather than a
         real description; confirm against the exporter before relying on it. -->
    <dct:description>1</dct:description>
    <!-- Access rights are stated twice: as an authority URI (PUBLIC) and as an
         inline dct:RightsStatement labelled "Open Access". -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
    <!-- Single distribution: the access URL is the article DOI, while the
         download URL is the PDF file hosted under the Zenodo record.
         NOTE(review): dcat:byteSize and dcat:mediaType are plain literals with
         no rdf:datatype; consumers that need a numeric size may have to cast. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.3390/app9122460"/>
        <dcat:byteSize>3595588</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/5729320/files/applsci-09-02460.pdf"/>
        <dcat:mediaType>application/pdf</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
  <!-- Grant description for info:eu-repo/grantAgreement/EC/H2020/825619/, the
       resource that the record's frapo:isFundedBy property refers to.
       dct:identifier repeats the final path segment of that URI (825619) and
       dct:title gives the project title. The awarding body is an unnamed
       (blank-node) foaf:Organization with the same identifier and name as the
       organisation listed under schema:funder on the record. -->
  <foaf:Project rdf:about="info:eu-repo/grantAgreement/EC/H2020/825619/">
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">825619</dct:identifier>
    <dct:title>A European AI On Demand Platform and Ecosystem</dct:title>
    <frapo:isAwardedBy>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/100010661</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </frapo:isAwardedBy>
  </foaf:Project>
</rdf:RDF>
19 views
18 downloads
Views 19
Downloads 18
Data volume 64.7 MB
Unique views 15
Unique downloads 15

Share

Cite as