There is a newer version of this record available.

Dataset Open Access

High quality protein residues: top2018 mainchain-filtered residues

Williams, Christopher; Richardson, David; Richardson, Jane


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.4626150">
    <!-- Core identification of the record: typed as a DCAT dataset and a DCMI Dataset;
         the DOI URL is given both as an xsd:anyURI identifier literal and as the foaf:page. -->
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Dataset"/>
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.4626150</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.4626150"/>
    <!-- Creator: Christopher Williams, identified by ORCID iD, member of Duke University.
         Written with RDF/XML typed-node and property-attribute abbreviations (same triples). -->
    <dct:creator>
      <foaf:Agent rdf:about="http://orcid.org/0000-0002-5808-8768">
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-5808-8768</dct:identifier>
        <foaf:name>Williams, Christopher</foaf:name>
        <foaf:givenName>Christopher</foaf:givenName>
        <foaf:familyName>Williams</foaf:familyName>
        <org:memberOf>
          <foaf:Organization foaf:name="Duke University"/>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: David Richardson, identified by ORCID iD, member of Duke University.
         Written with RDF/XML typed-node and property-attribute abbreviations (same triples). -->
    <dct:creator>
      <foaf:Agent rdf:about="http://orcid.org/0000-0001-5069-343X">
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0001-5069-343X</dct:identifier>
        <foaf:name>Richardson, David</foaf:name>
        <foaf:givenName>David</foaf:givenName>
        <foaf:familyName>Richardson</foaf:familyName>
        <org:memberOf>
          <foaf:Organization foaf:name="Duke University"/>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <!-- Creator: Jane Richardson, identified by ORCID iD, member of Duke University.
         Written with RDF/XML typed-node and property-attribute abbreviations (same triples). -->
    <dct:creator>
      <foaf:Agent rdf:about="http://orcid.org/0000-0002-3311-2944">
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-3311-2944</dct:identifier>
        <foaf:name>Richardson, Jane</foaf:name>
        <foaf:givenName>Jane</foaf:givenName>
        <foaf:familyName>Richardson</foaf:familyName>
        <org:memberOf>
          <foaf:Organization foaf:name="Duke University"/>
        </org:memberOf>
      </foaf:Agent>
    </dct:creator>
    <dct:title>High quality protein residues: top2018 mainchain-filtered residues</dct:title>
    <!-- Publishing agent of this record; the name is carried as a property attribute. -->
    <dct:publisher>
      <foaf:Agent foaf:name="Zenodo"/>
    </dct:publisher>
    <!-- Issue date is stated twice, once at year precision (xsd:gYear) and once as a full xsd:date.
         owl:sameAs equates the DOI resource with the Zenodo record URL. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2021</dct:issued>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2021-03-21</dct:issued>
    <owl:sameAs rdf:resource="https://zenodo.org/record/4626150"/>
    <!-- Alternate identifier: the Zenodo record URL as an xsd:anyURI notation, scheme agency "url". -->
    <adms:identifier>
      <adms:Identifier adms:schemeAgency="url">
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/4626150</skos:notation>
      </adms:Identifier>
    </adms:identifier>
    <!-- Versioning: points at the resource this record is a version of, and gives this record's version label.
         NOTE(review): the target DOI (…4626149) is presumably the version-independent DOI; confirm. -->
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.4626149"/>
    <owl:versionInfo>0.9</owl:versionInfo>
    <dct:description>&lt;p&gt;Introduction&lt;br&gt; --------------------------------------------------------------------------------&lt;br&gt; This directory contains files from the top2018 dataset by the Richardson Lab at Duke University.&lt;/p&gt; &lt;p&gt;These are high-quality residues from high-quality, low redundancy protein chains in the PDB.&lt;/p&gt; &lt;p&gt;&lt;br&gt; Usage recommendations&lt;br&gt; --------------------------------------------------------------------------------&lt;br&gt; Protein residues that fail the filtering criteria described below have been removed from the files.&amp;nbsp; As a result, these files can be considered pre-filtered and will return only results for residues of good model quality with supporting experimental data.&amp;nbsp; As long as the question concerns mainchain protein atoms, these files should be usable as is.&lt;/p&gt; &lt;p&gt;The top2018 contains several different levels of homology clustering to ensure nonredundant datasets.&amp;nbsp; The 70% homology level is a reliable default.&amp;nbsp; These chains are listed in top2018_chains_hom70_60pct_complete.txt&lt;/p&gt; &lt;p&gt;Files are organized in subdirectories based on the first two letters of their PDB ids.&lt;/p&gt; &lt;p&gt;Files already contain hydrogens added by Reduce.&amp;nbsp; NQH flips have been performed to ensure that these are the best versions of these structures.&lt;/p&gt; &lt;p&gt;top2018_metadata_mcfilter.csv contains information on release data, resolution, and validation scores.&lt;/p&gt; &lt;p&gt;top2018_passrates_mc_filtered.csv contains information on how many protein residues from the original chain passed the quality filters.&lt;/p&gt; &lt;p&gt;&lt;br&gt; Homology sets:&lt;br&gt; --------------------------------------------------------------------------------&lt;br&gt; Using sequence homology clusters provided by the RCSB PDB, for each homology cluster, the best chain was selected for inclusion in the dataset.&amp;nbsp; This ensures 
minimal sequence/structural redundancy.&lt;/p&gt; &lt;p&gt;The top2018 is available at several different levels of homology clustering, which may be appropriate to different uses.&amp;nbsp; Lists of the included chains at each homology level are included in this distribution.&lt;/p&gt; &lt;p&gt;Lower homology numbers mean greater variety and less redundancy, but also fewer total chains in the dataset.&lt;/p&gt; &lt;p&gt;For general use, ***we recommend the 70% homology set*** as a good balance between inclusivity and variety. This list is given in the file top2018_chains_hom70_60pct_complete.txt&lt;/p&gt; &lt;p&gt;&lt;br&gt; Usage caveats:&lt;/p&gt; &lt;p&gt;--------------------------------------------------------------------------------&lt;br&gt; These files are incomplete.&amp;nbsp; They are single chains from structures that may have had multiple chains.&amp;nbsp; Residues that fail the filtering criteria have been removed.&amp;nbsp; Programs with strong requirements for completeness or uninterrupted chains should be used with care.&lt;/p&gt; &lt;p&gt;All header information from the original structure has been preserved.&amp;nbsp; This includes information about chains and residues no longer present in the file.&lt;/p&gt; &lt;p&gt;All ligands and waters associated with the chain have been preserved without filtering.&amp;nbsp; Robust ligand filtering is beyond the scope of this dataset.&amp;nbsp; Trust the ligands at your own discretion.&lt;/p&gt; &lt;p&gt;Sidechain atoms beyond CB have not been considered in the filtering.&amp;nbsp; However, all sidechains have been included for residues that passed the mainchain filters.&amp;nbsp; DO NOT use this set of files for serious questions involving sidechains.&amp;nbsp; See our all-atom filtered dataset instead.&lt;/p&gt; &lt;p&gt;&lt;br&gt; Filtering criteria: Chain level&lt;br&gt; --------------------------------------------------------------------------------&lt;br&gt; Chain is protein&lt;br&gt; Released on or 
before Dec 31, 2018&lt;br&gt; Resolution &amp;lt; 2.0&lt;br&gt; MolProbity Score &amp;lt; 2.0&lt;br&gt; &amp;lt;3% residues have cbeta deviations&lt;br&gt; &amp;lt;2% residues have covalent bond length outliers&lt;br&gt; &amp;lt;2% residues have covalent bond geometry outliers&lt;/p&gt; &lt;p&gt;Using sequence homology clusters provided by the RCSB PDB, for each homology cluster, the chain with the best (lowest) average of Resolution and MolProbity Score was selected.&lt;/p&gt; &lt;p&gt;&lt;br&gt; Filtering criteria: Residue level&lt;br&gt; --------------------------------------------------------------------------------&lt;br&gt; Even good structures may contain poorly-resolved regions.&amp;nbsp; Residue-level filtering helps avoid including these regions in otherwise high-quality data&lt;/p&gt; &lt;p&gt;Mainchain atoms are defined as N, CA, C, O, CB.&lt;br&gt; Note that CB is included, since its ideal position is defined by the other mainchan atoms.&lt;/p&gt; &lt;p&gt;All mainchain atoms in a residue:&lt;br&gt; Bfactor &amp;lt;= 40&lt;br&gt; Real-space correlation coefficient (rscc) &amp;gt;= 0.7&lt;br&gt; 2Fo-Fc map value &amp;gt;= 1.2&lt;/p&gt; &lt;p&gt;Additionally, residues are not allowed to have:&lt;br&gt; Covalent geometry outliers&lt;br&gt; Steric overlaps or &amp;quot;clashes&amp;quot;, as per Probe&lt;br&gt; Alternate conformations&lt;/p&gt; &lt;p&gt;&lt;br&gt; Chain Completeness criteria&lt;br&gt; --------------------------------------------------------------------------------&lt;br&gt; Chains which lost &amp;gt;40% of their residues during filtering were dropped from this dataset.&amp;nbsp; All chains present here are at least 60% complete.&lt;/p&gt;</dct:description>
    <!-- Access rights are stated twice: as the EU authority-table PUBLIC URI, and as an
         info:eu-repo rights statement labelled "Open Access" (label carried as a property attribute). -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess" rdfs:label="Open Access"/>
    </dct:accessRights>
    <!-- Distribution carrying the licence: access via the DOI landing page, licensed under the CC BY 4.0 legal code. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4626150"/>
        <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution for the README file.
         dcat:byteSize is given an explicit numeric datatype: DCAT defines its range as a typed
         literal (xsd:nonNegativeInteger in DCAT 3), whereas an untyped literal is read as a string. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4626150"/>
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">3898</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4626150/files/README"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution for the filtered PDB archive (top2018_mc_filtered_pdbs.tar.gz).
         dcat:byteSize is given an explicit numeric datatype: DCAT defines its range as a typed
         literal (xsd:nonNegativeInteger in DCAT 3), whereas an untyped literal is read as a string.
         NOTE(review): dcat:mediaType is a plain literal "application/x-tar" although the file name
         ends in .tar.gz; DCAT recommends IANA media-type URIs here. Confirm before changing. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.4626150"/>
        <dcat:byteSize rdf:datatype="http://www.w3.org/2001/XMLSchema#nonNegativeInteger">1448266962</dcat:byteSize>
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/4626150/files/top2018_mc_filtered_pdbs.tar.gz"/>
        <dcat:mediaType>application/x-tar</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
</rdf:RDF>
254
81
views
downloads
All versions This version
Views 254207
Downloads 8130
Data volume 25.9 GB18.8 GB
Unique views 203175
Unique downloads 5121

Share

Cite as