Dataset Open Access

Lemmatized English Word2Vec data

Christian Chiarcos; Tomas Mikolov et al.


DCAT Export

<?xml version="1.0" encoding="UTF-8"?>
<!-- DCAT export serialized as RDF/XML; every vocabulary prefix used below is declared once on the root element. -->
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:adms="http://www.w3.org/ns/adms#"
         xmlns:dc="http://purl.org/dc/elements/1.1/"
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dctype="http://purl.org/dc/dcmitype/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:duv="http://www.w3.org/ns/duv#"
         xmlns:foaf="http://xmlns.com/foaf/0.1/"
         xmlns:frapo="http://purl.org/cerif/frapo/"
         xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
         xmlns:gsp="http://www.opengis.net/ont/geosparql#"
         xmlns:locn="http://www.w3.org/ns/locn#"
         xmlns:org="http://www.w3.org/ns/org#"
         xmlns:owl="http://www.w3.org/2002/07/owl#"
         xmlns:prov="http://www.w3.org/ns/prov#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
         xmlns:schema="http://schema.org/"
         xmlns:skos="http://www.w3.org/2004/02/skos/core#"
         xmlns:vcard="http://www.w3.org/2006/vcard/ns#"
         xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <!-- Dataset record. The subject is the record's DOI; it is typed as dcat:Dataset and as the DCMI Dataset type,
       and the same DOI is repeated as the identifier literal and as the foaf:page link. -->
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.4421380">
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Dataset"/>
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.4421380</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
    <!-- First creator: a foaf:Agent identified by an ORCID URI, with the bare ORCID as identifier and an
         organizational affiliation given as an anonymous foaf:Organization. -->
    <dct:creator>
      <foaf:Agent rdf:about="http://orcid.org/0000-0002-4428-029X">
        <foaf:name>Christian Chiarcos</foaf:name>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-4428-029X</dct:identifier>
        <org:memberOf>
          <rdf:Description>
            <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Organization"/>
            <foaf:name>Goethe University Frankfurt, Germany</foaf:name>
          </rdf:Description>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Second creator: an anonymous foaf:Agent carrying only a display name. -->
    <dct:creator>
      <foaf:Agent>
        <foaf:name>Tomas Mikolov et al.</foaf:name>
      </foaf:Agent>
    </dct:creator>
    <!-- Descriptive metadata: title, publishing agent, year of issue, and free-text keywords. -->
    <dct:title>Lemmatized English Word2Vec data</dct:title>
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <!-- Year-only issue date (xsd:gYear); this record also carries a second dct:issued typed as a full xsd:date. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2021</dct:issued>
    <dcat:keyword>word embeddings</dcat:keyword>
    <dcat:keyword>word2vec</dcat:keyword>
    <dcat:keyword>English</dcat:keyword>
    <!-- Funding: a link to the grant-agreement resource (described as a foaf:Project near the end of this
         document) plus the funding organization as an anonymous node.
         NOTE(review): the funder identifier looks like a funder-registry DOI; confirm the scheme. -->
    <frapo:isFundedBy rdf:resource="info:eu-repo/grantAgreement/EC/H2020/825182/"/>
    <schema:funder>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/501100000780</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </schema:funder>
    <!-- Full issue date (xsd:date) and the language of the resource as an EU authority-table URI. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2021-01-06</dct:issued>
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/ENG"/>
    <!-- Alternate identity: the Zenodo record URL, given both as owl:sameAs and as an adms:Identifier. -->
    <owl:sameAs rdf:resource="https://zenodo.org/record/4421380"/>
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/4421380</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <!-- Relations: the DOI this record is a version of, and the Zenodo communities it is part of. -->
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.4421379"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/nexuslinguarum"/>
    <dct:isPartOf rdf:resource="https://zenodo.org/communities/pret-a-llod"/>
    <!-- Version label is a date string; the description text states the source data was obtained on Jun 3, 2020. -->
    <owl:versionInfo>2020-06-03</owl:versionInfo>
    <!-- Two description literals. The first holds HTML markup escaped as XML entities; all whitespace inside it,
         including the line break within the reference list, is part of the literal value, so do not re-wrap it.
         The second is a plain-text funding acknowledgement. -->
    <dct:description>&lt;p&gt;# Lemmatized English Word2Vec data&lt;/p&gt; &lt;p&gt;This is a version of the original GoogleNews-vectors-negative300 Word2Vec embeddings for English.&lt;br&gt; In addition, we provide the following modified files:&lt;/p&gt; &lt;p&gt;- converted to conventional CSV format (and gzipped)&lt;br&gt; - subclassified:&lt;br&gt; &amp;nbsp; for the most frequent 1.000.000 words:&lt;br&gt; &amp;nbsp;&amp;nbsp; &amp;nbsp;subclassified according to WordNet parts of speech: ADJ, ADV, NOUN, VERB, OTHER&lt;br&gt; &amp;nbsp;&amp;nbsp; &amp;nbsp;note that one embedding can be associated with multiple parts of speech&lt;br&gt; &amp;nbsp; for the remaining words:&lt;br&gt; &amp;nbsp;&amp;nbsp;&amp;nbsp; RARE: top 1.000.001 - 2.000.000 words&lt;br&gt; &amp;nbsp;&amp;nbsp; &amp;nbsp;VERY_RARE: top 2.000.001 - 3.000.000 words&lt;br&gt; - WordNet lemmatization (via NLTK) in separate files&lt;br&gt; &amp;nbsp;&amp;nbsp; &amp;nbsp;(first lemma only)&lt;/p&gt; &lt;p&gt;Note that this is not a product of original research, but a derived work, deposited here as a point of permanent reference and as a building stone of subsequent research. For such application, a publication independent from Google is necessary to guarantee stability against changes in their data releases.&lt;/p&gt; &lt;p&gt;The original Word2vec code and data was published via https://code.google.com/archive/p/word2vec/ under an Apache License 2.0. We obtained the Word2vec data from&amp;nbsp; https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing on Jun 3, 2020.&lt;/p&gt; &lt;p&gt;The Word2vec documentation included the following references:&lt;/p&gt; &lt;p&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. 
In Proceedings of Workshop at ICLR, 2013.&lt;/p&gt; &lt;p&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013.&lt;/p&gt; &lt;p&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; [3] Tomas Mikolov, Wen-tau Yih, and Geoffrey Zweig. Linguistic Regularities in Continuous Space Word Representations. In Proceedings of NAACL HLT, 2013.&lt;/p&gt; &lt;p&gt;The derived data is made available under the same license (Apache License 2.0). However, note that the content derived from WordNet (lemmas) are subject to the Princeton Wordnet license as stated in LICENSE.wordnet.&lt;/p&gt; &lt;p&gt;Data provided by the Applied Computational Linguistics Lab of the Goethe University Frankfurt, Germany. Original data developed by Mikolov et al.&lt;/p&gt;</dct:description>
    <dct:description>Partially funded by the German Federal Ministry of Education and Research (BMBF), project "Linked Open Dictionaries".</dct:description>
    <!-- Access rights are stated twice: once as an EU authority-table URI (PUBLIC) and once as an
         info:eu-repo rights statement labelled "Open Access". -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <!-- Record-level distribution: carries the licence (as a dct:RightsStatement via dct:rights) and the DOI as
         access URL; it has no download URL or size.
         NOTE(review): DCAT also offers dct:license for licence documents; confirm which property consumers expect. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dct:rights>
          <dct:RightsStatement rdf:about="https://opensource.org/licenses/Apache-2.0">
            <rdfs:label>Apache License 2.0</rdfs:label>
          </dct:RightsStatement>
        </dct:rights>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.bin.gz.
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>1647046227</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.bin.gz"/>
        <dcat:mediaType>application/octet-stream</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_ADJ.csv.gz.
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>10792573</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_ADJ.csv.gz"/>
        <dcat:mediaType>text/csv</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_ADJ.lemmas (no media type given).
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>242847</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_ADJ.lemmas"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_ADV.csv.gz.
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>4317304</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_ADV.csv.gz"/>
        <dcat:mediaType>text/csv</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_ADV.lemmas (no media type given).
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>108811</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_ADV.lemmas"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_NOUN.csv.gz.
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>309467790</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_NOUN.csv.gz"/>
        <dcat:mediaType>text/csv</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_NOUN.lemmas (no media type given).
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>6327664</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_NOUN.lemmas"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_OTHER.csv.gz.
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>443820238</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_OTHER.csv.gz"/>
        <dcat:mediaType>text/csv</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_OTHER.lemmas (no media type given).
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>21933</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_OTHER.lemmas"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_RARE.csv.gz.
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>811570257</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_RARE.csv.gz"/>
        <dcat:mediaType>text/csv</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_VERB.csv.gz.
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>38571098</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_VERB.csv.gz"/>
        <dcat:mediaType>text/csv</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_VERB.lemmas (no media type given).
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>775339</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_VERB.lemmas"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: GoogleNews-vectors-negative300.txt_VERY_RARE.csv.gz.
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>824637315</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/GoogleNews-vectors-negative300.txt_VERY_RARE.csv.gz"/>
        <dcat:mediaType>text/csv</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: LICENSE (no media type given).
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>11358</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/LICENSE"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: LICENSE.wordnet (no media type given).
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>1590</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/LICENSE.wordnet"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: README.md (no media type given).
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>2000</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/README.md"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- File distribution: README.word2vec (no media type given).
         accessURL and downloadURL are IRI references (rdf:resource), not string literals. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4421380"/>
        <dcat:byteSize>1209</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4421380/files/README.word2vec"/>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
  <!-- Grant-agreement resource that the dataset's frapo:isFundedBy link points to: grant number, project title,
       and the awarding organization as an anonymous node. -->
  <foaf:Project rdf:about="info:eu-repo/grantAgreement/EC/H2020/825182/">
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">825182</dct:identifier>
    <dct:title>Ready-to-use Multilingual Linked Language Data for Knowledge Services across Sectors</dct:title>
    <frapo:isAwardedBy>
      <foaf:Organization>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">10.13039/501100000780</dct:identifier>
        <foaf:name>European Commission</foaf:name>
      </foaf:Organization>
    </frapo:isAwardedBy>
  </foaf:Project>
</rdf:RDF>
55
41
views
downloads
All versions This version
Views 55 55
Downloads 41 41
Data volume 11.5 GB 11.5 GB
Unique views 53 53
Unique downloads 10 10

Share

Cite as