Dataset Open Access

High quality protein residues: top2018 mainchain-filtered residues

Williams, Christopher; Richardson, David; Richardson, Jane


JSON Export

{
  "files": [
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/README_mc_filter.txt"
      }, 
      "checksum": "md5:750f8826827c5369a1ce46c1117acb54", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "README_mc_filter.txt", 
      "type": "txt", 
      "size": 6807
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/sample_file_loop.py"
      }, 
      "checksum": "md5:f542f9a7ce0f4c220da7ba16c430fc6b", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "sample_file_loop.py", 
      "type": "py", 
      "size": 641
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_chains_hom30_mcfilter_60pct_complete.txt"
      }, 
      "checksum": "md5:aaa1b4dbc1191489ed6e05cf6060fbb5", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_chains_hom30_mcfilter_60pct_complete.txt", 
      "type": "txt", 
      "size": 58149
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_chains_hom50_mcfilter_60pct_complete.txt"
      }, 
      "checksum": "md5:8b955c245684bbd76ff23436f79a5445", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_chains_hom50_mcfilter_60pct_complete.txt", 
      "type": "txt", 
      "size": 82642
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_chains_hom70_mcfilter_60pct_complete.txt"
      }, 
      "checksum": "md5:4e1d77c173f33ec5f2df9aada2901747", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_chains_hom70_mcfilter_60pct_complete.txt", 
      "type": "txt", 
      "size": 95739
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_chains_hom90_mcfilter_60pct_complete.txt"
      }, 
      "checksum": "md5:72bdb96c209ae36b59dd1b4f71361064", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_chains_hom90_mcfilter_60pct_complete.txt", 
      "type": "txt", 
      "size": 106274
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_metadata_mc_filtered.csv"
      }, 
      "checksum": "md5:eecc82da39bed07f3d26bdb6d5272c2d", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_metadata_mc_filtered.csv", 
      "type": "csv", 
      "size": 2007333
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_passrates_mc_filtered.csv"
      }, 
      "checksum": "md5:9ad257bd0b521aa0cfc1a986d869060f", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_passrates_mc_filtered.csv", 
      "type": "csv", 
      "size": 340310
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_pdbs_mc_filtered_hom30.tar.gz"
      }, 
      "checksum": "md5:c80be319fcdc7ca3072a1344af5b7aba", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_pdbs_mc_filtered_hom30.tar.gz", 
      "type": "gz", 
      "size": 782122628
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_pdbs_mc_filtered_hom50.tar.gz"
      }, 
      "checksum": "md5:7197acde2313b0a1da84786b0e897bd2", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_pdbs_mc_filtered_hom50.tar.gz", 
      "type": "gz", 
      "size": 1140526057
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_pdbs_mc_filtered_hom70.tar.gz"
      }, 
      "checksum": "md5:31ec0d70c296b63891d8e57f8fd546a4", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_pdbs_mc_filtered_hom70.tar.gz", 
      "type": "gz", 
      "size": 1330693936
    }, 
    {
      "links": {
        "self": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896/top2018_pdbs_mc_filtered_hom90.tar.gz"
      }, 
      "checksum": "md5:fd1671aa7e88e49335ff44e87d96883c", 
      "bucket": "02472703-87f9-43f4-8d31-7c3b99ab3896", 
      "key": "top2018_pdbs_mc_filtered_hom90.tar.gz", 
      "type": "gz", 
      "size": 1465922465
    }
  ], 
  "owners": [
    204887
  ], 
  "doi": "10.5281/zenodo.5777651", 
  "stats": {
    "version_unique_downloads": 653.0, 
    "unique_views": 817.0, 
    "views": 966.0, 
    "version_views": 1419.0, 
    "unique_downloads": 547.0, 
    "version_unique_views": 1184.0, 
    "volume": 182414975984.0, 
    "version_downloads": 1035.0, 
    "downloads": 879.0, 
    "version_volume": 231049408825.0
  }, 
  "links": {
    "doi": "https://doi.org/10.5281/zenodo.5777651", 
    "conceptdoi": "https://doi.org/10.5281/zenodo.4626149", 
    "bucket": "https://zenodo.org/api/files/02472703-87f9-43f4-8d31-7c3b99ab3896", 
    "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.4626149.svg", 
    "html": "https://zenodo.org/record/5777651", 
    "latest_html": "https://zenodo.org/record/5777651", 
    "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.5777651.svg", 
    "latest": "https://zenodo.org/api/records/5777651"
  }, 
  "conceptdoi": "10.5281/zenodo.4626149", 
  "created": "2021-12-13T21:17:38.674605+00:00", 
  "updated": "2021-12-14T01:48:41.762859+00:00", 
  "conceptrecid": "4626149", 
  "revision": 2, 
  "id": 5777651, 
  "metadata": {
    "access_right_category": "success", 
    "doi": "10.5281/zenodo.5777651", 
    "description": "<p>Introduction<br>\n--------------------------------------------------------------------------------<br>\nThis directory contains files from the top2018 dataset by the Richardson Lab at Duke University.</p>\n\n<p>These are high-quality residues from high-quality, low redundancy protein chains in the PDB.</p>\n\n<p>This dataset is quality-filtered on mainchain atoms.&nbsp; For the full-residue filtered set, see https://doi.org/10.5281/zenodo.5115232</p>\n\n<p>The accompanying publication is:<br>\nWilliams, C. J., Richardson, D. C., &amp; Richardson, J. S. (2021). The importance of residue\u2010level filtering, and the Top2018 best\u2010parts dataset of high\u2010quality protein residues. Protein Science. http://doi.org/10.1002/pro.4239</p>\n\n<p>Usage recommendations<br>\n--------------------------------------------------------------------------------<br>\nProtein residues that fail the filtering criteria described below have been removed from the files.&nbsp; As a result, these files can be considered pre-filtered and will return only results for residues of good model quality with supporting experimental data.&nbsp; As long as the question concerns mainchain protein atoms, these files should be usable as is.&nbsp; There is a separate version that has been filtered on all atoms that is suitable for sidechains.</p>\n\n<p>The top2018 contains several different levels of homology clustering (30%, 50%, 70%, 90%) to ensure nonredundant datasets.&nbsp; The 70% homology level is a reliable default.&nbsp; These chains are listed in top2018_chains_hom70_mcfilter_60pct_complete.txt and found in top2018_pdbs_mc_filtered_hom70.tar.gz</p>\n\n<p>Files are organized in subdirectories based on the first two letters of their PDB ids.&nbsp; The included python script sample_file_loop.py may aid in accessing the directory structure.</p>\n\n<p>Files already contain hydrogens added by Reduce.&nbsp; NQH flips have been performed to ensure that these are the best 
versions of these structures.</p>\n\n<p>top2018_metadata_mc_filtered.csv contains information on release date, resolution, and validation scores for each file.</p>\n\n<p>top2018_passrates_mc_filtered.csv contains information on how many protein residues from the original chain passed the quality filters.</p>\n\n<p><br>\nHomology sets:<br>\n--------------------------------------------------------------------------------<br>\nUsing sequence homology clusters provided by the RCSB PDB, for each homology cluster, the best chain was selected for inclusion in the dataset.&nbsp; This ensures minimal sequence/structural redundancy.</p>\n\n<p>The top2018 is available at several different levels of homology clustering, which may be appropriate to different uses.&nbsp; Lists of the included chains at each homology level are included in this distribution.</p>\n\n<p>Lower homology numbers mean less redundancy, but fewer total chains in the dataset.</p>\n\n<p>For general use, ***we recommend the 70% homology set*** as a good balance between inclusivity and variety. 
This list is given in the file top2018_chains_hom70_mcfilter_60pct_complete.txt</p>\n\n<p><br>\nUsage caveats:<br>\n--------------------------------------------------------------------------------<br>\nThese files are incomplete.&nbsp; They are single chains from structures that may have had multiple chains.&nbsp; Residues that fail the filtering criteria have been removed.&nbsp; Programs with strong requirements for completeness or uninterrupted chains should be used with care.&nbsp; Chain completeness and fragmentation statistics are available in top2018_passrates_mc_filtered.csv and in USER records at the end of each .pdb file.</p>\n\n<p>All header information from the original structure has been preserved.&nbsp; This includes information about chains and residues no longer present in the file.</p>\n\n<p>All ligands and waters associated with the chain have been preserved without filtering.&nbsp; Robust ligand filtering is beyond the scope of this dataset.&nbsp; Trust the ligands at your own discretion.</p>\n\n<p>Sidechain atoms beyond CB have not been considered in the filtering.&nbsp; However, all sidechains have been included for residues that passed the mainchain filters.&nbsp; DO NOT use this set of files for serious questions involving sidechains.&nbsp; See our all-atom filtered dataset instead.</p>\n\n<p><br>\nFiltering criteria: Chain level<br>\n--------------------------------------------------------------------------------<br>\nChain is protein<br>\nReleased on or before Dec 31, 2018<br>\nResolution &lt; 2.0<br>\nMolProbity Score &lt; 2.0<br>\n&lt;3% residues have cbeta deviations<br>\n&lt;2% residues have covalent bond length outliers<br>\n&lt;2% residues have covalent bond geometry outliers</p>\n\n<p>Using sequence homology clusters provided by the RCSB PDB, for each homology cluster, the chain with the best (lowest) average of Resolution and MolProbity Score was selected.</p>\n\n<p><br>\nFiltering criteria: Residue 
level<br>\n--------------------------------------------------------------------------------<br>\nEven excellent structures usually contain some poorly-resolved regions.&nbsp; Residue-level filtering helps avoid including these regions in otherwise high-quality data.</p>\n\n<p>Mainchain atoms are defined as N, CA, C, O, CB.<br>\nNote that CB is included, since its ideal position is defined by the other mainchain atoms.</p>\n\n<p>All mainchain atoms in a residue:<br>\nBfactor &lt;= 40<br>\nReal-space correlation coefficient (rscc) &gt;= 0.7<br>\n2Fo-Fc map value &gt;= 1.2</p>\n\n<p>Additionally, residues are not allowed to have:<br>\nCovalent geometry outliers<br>\nSteric overlaps or &quot;clashes&quot;, as per Probe<br>\nAlternate conformations</p>\n\n<p><br>\nChain Completeness criteria<br>\n--------------------------------------------------------------------------------<br>\nChains which lost &gt;40% of their residues during filtering were dropped from this dataset.&nbsp; All chains present here are at least 60% complete.</p>\n\n<p><br>\nFiltering documentation<br>\n--------------------------------------------------------------------------------<br>\nEach file documents its pruned and included residues with USER records.&nbsp; These include self-documenting USER&nbsp; DOC lines as follows:<br>\nUSER&nbsp; DOC Lines marked with USER&nbsp; DEL list residues pruned by<br>\nUSER&nbsp; DOC quality filtering.<br>\nUSER&nbsp; DOC Format is chain:resseq:icode:reason_for_pruning<br>\nUSER&nbsp; DOC Reasons for pruning are abbreviated as 1-letter codes: bcmgoa<br>\nUSER&nbsp; DOC b=bfactor, c=real space correlation, m=2Fo-Fc mapvalue<br>\nUSER&nbsp; DOC g=geometry outlier, o=steric overlap, a=alternate conformations<br>\nUSER&nbsp; DOC Lines marked USER&nbsp; INC list the uninterrupted fragments of structure<br>\nUSER&nbsp; DOC still included after pruning by quality filtering<br>\nUSER&nbsp; DOC Format is 
chain1:resseq1:icode1:chain2:resseq2:icode2:fragment_length<br>\nUSER&nbsp; DOC where 1 is the first and 2 the last residue of the fragment<br>\nUSER&nbsp; DOC Line marked with USER&nbsp; PCT gives statistics for structure completeness</p>\n\n<p>Version history<br>\n--------------------------------------------------------------------------------<br>\nVersion 0.9 10.5281/zenodo.4626150 &nbsp;&nbsp; &nbsp;Mar 21, 2021<br>\nInitial version</p>\n\n<p>Version 1.0 10.5281/zenodo.5115075 &nbsp;&nbsp; &nbsp;Jul 19, 2021<br>\nSplit into 30, 50, 70, and 90% homology sets</p>\n\n<p>Version 2.0<br>\nSet case of filenames to unambiguous standard: all lowercase except L</p>\n\n<p>Version 2.01</p>\n\n<p>Added missing chain list for recommended hom70 set</p>", 
    "license": {
      "id": "CC-BY-4.0"
    }, 
    "title": "High quality protein residues: top2018 mainchain-filtered residues", 
    "relations": {
      "version": [
        {
          "count": 4, 
          "index": 3, 
          "parent": {
            "pid_type": "recid", 
            "pid_value": "4626149"
          }, 
          "is_last": true, 
          "last_child": {
            "pid_type": "recid", 
            "pid_value": "5777651"
          }
        }
      ]
    }, 
    "version": "2.01", 
    "publication_date": "2021-07-19", 
    "creators": [
      {
        "orcid": "0000-0002-5808-8768", 
        "affiliation": "Duke University", 
        "name": "Williams, Christopher"
      }, 
      {
        "orcid": "0000-0001-5069-343X", 
        "affiliation": "Duke University", 
        "name": "Richardson, David"
      }, 
      {
        "orcid": "0000-0002-3311-2944", 
        "affiliation": "Duke University", 
        "name": "Richardson, Jane"
      }
    ], 
    "access_right": "open", 
    "resource_type": {
      "type": "dataset", 
      "title": "Dataset"
    }, 
    "related_identifiers": [
      {
        "scheme": "doi", 
        "identifier": "10.5281/zenodo.4626149", 
        "relation": "isVersionOf"
      }
    ]
  }
}
1,419 views
1,035 downloads
All versions | This version
Views: 1,419 | 966
Downloads: 1,035 | 879
Data volume: 231.0 GB | 182.4 GB
Unique views: 1,184 | 817
Unique downloads: 653 | 547

Share

Cite as