Conference paper Open Access

Automatic Part-of-Speech Tagging for Security Vulnerability Descriptions

Yitagesu, Sofonias; Zhang, Xiaowang; Feng, Zhiyong; Li, Xiaohong; Xing, Zhenchang


JSON Export

{
  "files": [
    {
      "links": {
        "self": "https://zenodo.org/api/files/04632802-4446-4a20-8262-8ea6374f4f4d/Automatic%20Part-of-Speech%20Tagging%20for%20SVD.pdf"
      }, 
      "checksum": "md5:60927d17dae7e1a37f3331cb1081f68d", 
      "bucket": "04632802-4446-4a20-8262-8ea6374f4f4d", 
      "key": "Automatic Part-of-Speech Tagging for SVD.pdf", 
      "type": "pdf", 
      "size": 1377678
    }
  ], 
  "owners": [
    205201
  ], 
  "doi": "10.5281/zenodo.4632063", 
  "stats": {
    "version_unique_downloads": 226.0, 
    "unique_views": 296.0, 
    "views": 322.0, 
    "version_views": 322.0, 
    "unique_downloads": 226.0, 
    "version_unique_views": 296.0, 
    "volume": 359573958.0, 
    "version_downloads": 261.0, 
    "downloads": 261.0, 
    "version_volume": 359573958.0
  }, 
  "links": {
    "doi": "https://doi.org/10.5281/zenodo.4632063", 
    "conceptdoi": "https://doi.org/10.5281/zenodo.4632062", 
    "bucket": "https://zenodo.org/api/files/04632802-4446-4a20-8262-8ea6374f4f4d", 
    "conceptbadge": "https://zenodo.org/badge/doi/10.5281/zenodo.4632062.svg", 
    "html": "https://zenodo.org/record/4632063", 
    "latest_html": "https://zenodo.org/record/4632063", 
    "badge": "https://zenodo.org/badge/doi/10.5281/zenodo.4632063.svg", 
    "latest": "https://zenodo.org/api/records/4632063"
  }, 
  "conceptdoi": "10.5281/zenodo.4632062", 
  "created": "2021-03-23T19:35:49.684243+00:00", 
  "updated": "2021-03-24T00:27:30.816364+00:00", 
  "conceptrecid": "4632062", 
  "revision": 2, 
  "id": 4632063, 
  "metadata": {
    "access_right_category": "success", 
    "doi": "10.5281/zenodo.4632063", 
    "description": "<p>Abstract&mdash;In this paper, we study the problem of part-of-speech (POS) tagging for security vulnerability descriptions (SVD). In<br>\ncontrast to newswire articles, SVD often contains a high-level natural language description of the text composed of mixed<br>\nlanguage studded with codes, domain-specific jargon, vague language, and abbreviations. Moreover, training data dedicated<br>\nto security vulnerability research is not widely available. Existing neural network-based POS tagging has often relied on manually<br>\nannotated training data or applying natural language processing (NLP) techniques, suffering from two significant drawbacks. The<br>\nformer is extremely time-consuming and requires labor-intensive feature engineering and expertise. The latter is inadequate to<br>\nidentify linguistically-informed words specific to the SVD domain. In this paper, we propose an automatic approach to assign POS<br>\ntags to tokens in SVD. Our approach uses the character-level representation to automatically extract orthographic features and<br>\nunsupervised word embeddings to capture meaningful syntactic and semantic regularities from SVD. The character level representations are then concatenated with the word embedding as a combined feature, which is then learned and used to predict<br>\nthe POS tagging. To deal with the issue of the poor availability of annotated security vulnerability data, we implement a finetuning approach. Our approach provides public access to a POS annotated corpus of &sim;8M tokens, which serves as a training dataset in this domain. Our evaluation results show a significant improvement in accuracy (17.72%-28.22%) of POS tagging in SVD over the current approaches.</p>", 
    "contributors": [], 
    "title": "Automatic Part-of-Speech Tagging for Security Vulnerability Descriptions", 
    "license": {
      "id": "CC-BY-4.0"
    }, 
    "relations": {
      "version": [
        {
          "count": 1, 
          "index": 0, 
          "parent": {
            "pid_type": "recid", 
            "pid_value": "4632062"
          }, 
          "is_last": true, 
          "last_child": {
            "pid_type": "recid", 
            "pid_value": "4632063"
          }
        }
      ]
    }, 
    "language": "eng", 
    "keywords": [
      "Fine-Tuning, Part-of-Speech tagging, Unsupervised word embedding, Security vulnerability descriptions"
    ], 
    "publication_date": "2021-03-23", 
    "creators": [
      {
        "affiliation": "Tianjin University, China", 
        "name": "Yitagesu, Sofonias"
      }, 
      {
        "affiliation": "Tianjin University, China", 
        "name": "Zhang, Xiaowang"
      }, 
      {
        "affiliation": "Tianjin University, China", 
        "name": "Feng, Zhiyong"
      }, 
      {
        "affiliation": "Tianjin University, China", 
        "name": "Li, Xiaohong"
      }, 
      {
        "affiliation": "Australian National University, Australia", 
        "name": "Xing, Zhenchang"
      }
    ], 
    "meeting": {
      "acronym": "MSR 2021", 
      "dates": "17-19 May 2021", 
      "title": "The 2021 IEEE/ACM 18th International Conference on Mining Software Repositories"
    }, 
    "access_right": "open", 
    "resource_type": {
      "subtype": "conferencepaper", 
      "type": "publication", 
      "title": "Conference paper"
    }, 
    "related_identifiers": [
      {
        "scheme": "doi", 
        "identifier": "10.5281/zenodo.4632062", 
        "relation": "isVersionOf"
      }
    ]
  }
}
322
261
views
downloads
All versions This version
Views 322 322
Downloads 261 261
Data volume 359.6 MB 359.6 MB
Unique views 296 296
Unique downloads 226 226

Share

Cite as