{
  "access": {
    "embargo": {
      "active": false,
      "reason": null
    },
    "files": "public",
    "record": "public",
    "status": "open"
  },
  "created": "2025-12-02T21:48:18.020989+00:00",
  "custom_fields": {
    "code:developmentStatus": {
      "id": "active",
      "title": {
        "en": "Active"
      }
    },
    "code:programmingLanguage": [
      {
        "id": "python",
        "title": {
          "en": "Python"
        }
      },
      {
        "id": "typescript",
        "title": {
          "en": "TypeScript"
        }
      }
    ]
  },
  "deletion_status": {
    "is_deleted": false,
    "status": "P"
  },
  "files": {
    "count": 1,
    "enabled": true,
    "entries": {
      "KGSum.zip": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:8b89009130b12130bdf25af01b71348c",
        "ext": "zip",
        "id": "42f86186-08c1-4c12-980a-8a6037b5e27d",
        "key": "KGSum.zip",
        "links": {
          "content": "https://zenodo.org/api/records/17795222/files/KGSum.zip/content",
          "self": "https://zenodo.org/api/records/17795222/files/KGSum.zip"
        },
        "metadata": {},
        "mimetype": "application/zip",
        "size": 6872746,
        "storage_class": "L"
      }
    },
    "order": [],
    "total_bytes": 6872746
  },
  "id": "17795222",
  "is_draft": false,
  "is_published": true,
  "links": {
    "access": "https://zenodo.org/api/records/17795222/access",
    "access_grants": "https://zenodo.org/api/records/17795222/access/grants",
    "access_links": "https://zenodo.org/api/records/17795222/access/links",
    "access_request": "https://zenodo.org/api/records/17795222/access/request",
    "access_users": "https://zenodo.org/api/records/17795222/access/users",
    "archive": "https://zenodo.org/api/records/17795222/files-archive",
    "archive_media": "https://zenodo.org/api/records/17795222/media-files-archive",
    "communities": "https://zenodo.org/api/records/17795222/communities",
    "communities-suggestions": "https://zenodo.org/api/records/17795222/communities-suggestions",
    "doi": "https://doi.org/10.5281/zenodo.17795222",
    "draft": "https://zenodo.org/api/records/17795222/draft",
    "file_modification": "https://zenodo.org/api/records/17795222/file-modification",
    "files": "https://zenodo.org/api/records/17795222/files",
    "latest": "https://zenodo.org/api/records/17795222/versions/latest",
    "latest_html": "https://zenodo.org/records/17795222/latest",
    "media_files": "https://zenodo.org/api/records/17795222/media-files",
    "parent": "https://zenodo.org/api/records/17795221",
    "parent_doi": "https://doi.org/10.5281/zenodo.17795221",
    "parent_doi_html": "https://zenodo.org/doi/10.5281/zenodo.17795221",
    "parent_html": "https://zenodo.org/records/17795221",
    "preview_html": "https://zenodo.org/records/17795222?preview=1",
    "quota_increase": "https://zenodo.org/api/records/17795222/quota-increase",
    "request_deletion": "https://zenodo.org/api/records/17795222/request-deletion",
    "requests": "https://zenodo.org/api/records/17795222/requests",
    "reserve_doi": "https://zenodo.org/api/records/17795222/draft/pids/doi",
    "self": "https://zenodo.org/api/records/17795222",
    "self_doi": "https://doi.org/10.5281/zenodo.17795222",
    "self_doi_html": "https://zenodo.org/doi/10.5281/zenodo.17795222",
    "self_html": "https://zenodo.org/records/17795222",
    "self_iiif_manifest": "https://zenodo.org/api/iiif/record:17795222/manifest",
    "self_iiif_sequence": "https://zenodo.org/api/iiif/record:17795222/sequence/default",
    "versions": "https://zenodo.org/api/records/17795222/versions"
  },
  "media_files": {
    "count": 0,
    "enabled": false,
    "entries": {},
    "order": [],
    "total_bytes": 0
  },
  "metadata": {
    "contributors": [
      {
        "affiliations": [
          {
            "id": "0192m2k53",
            "identifiers": [
              {
                "identifier": "0192m2k53",
                "scheme": "ror"
              },
              {
                "identifier": "grid.11780.3f",
                "scheme": "grid"
              },
              {
                "identifier": "0000 0004 1937 0335",
                "scheme": "isni"
              }
            ],
            "name": "University of Salerno"
          }
        ],
        "person_or_org": {
          "family_name": "Tuozzo",
          "given_name": "Gabriele",
          "identifiers": [
            {
              "identifier": "0009-0004-5108-1995",
              "scheme": "orcid"
            }
          ],
          "name": "Tuozzo, Gabriele",
          "type": "personal"
        },
        "role": {
          "id": "researcher",
          "title": {
            "de": "WissenschaftlerIn",
            "en": "Researcher"
          }
        }
      },
      {
        "person_or_org": {
          "family_name": "Pellegrino",
          "given_name": "Maria Angela",
          "identifiers": [
            {
              "identifier": "0000-0001-8927-5833",
              "scheme": "orcid"
            }
          ],
          "name": "Pellegrino, Maria Angela",
          "type": "personal"
        },
        "role": {
          "id": "researcher",
          "title": {
            "de": "WissenschaftlerIn",
            "en": "Researcher"
          }
        }
      }
    ],
    "creators": [
      {
        "affiliations": [
          {
            "id": "0192m2k53",
            "identifiers": [
              {
                "identifier": "0192m2k53",
                "scheme": "ror"
              },
              {
                "identifier": "grid.11780.3f",
                "scheme": "grid"
              },
              {
                "identifier": "0000 0004 1937 0335",
                "scheme": "isni"
              }
            ],
            "name": "University of Salerno"
          }
        ],
        "person_or_org": {
          "family_name": "Mario Cosenza",
          "identifiers": [
            {
              "identifier": "0009-0000-8813-5521",
              "scheme": "orcid"
            }
          ],
          "name": "Mario Cosenza",
          "type": "personal"
        }
      }
    ],
    "description": "<h1><strong>KGSum: Automatic Classification and Profiling of&nbsp;Knowledge Graphs</strong></h1>\n<p><strong>KgSum</strong> is a Python application for extracting, preparing, and classifying Knowledge Graphs (KGs). It combines Large Language Models (such as Mistral Instructor 7B with QLoRA) and traditional machine learning for effective graph classification and profiling.</p>\n<div dir=\"auto\">\n<h2>Getting Started</h2>\n</div>\n<p dir=\"auto\">Follow these steps to set up KgSum locally.</p>\n<div dir=\"auto\">\n<h3>Prerequisites</h3>\n</div>\n<div dir=\"auto\">\n<h4>For Local Machine Learning Backend:</h4>\n</div>\n<ul>\n<li><strong>Miniconda</strong>&nbsp;(required)</li>\n<li><strong>Python 3.12</strong>&nbsp;(suggested)</li>\n<li><strong>CUDA 12.8</strong>&nbsp;(for transformer models like Mistral)</li>\n<li><strong>NVIDIA GPU</strong>&nbsp;(recommended: RTX 3070 or higher)</li>\n</ul>\n<div dir=\"auto\">\n<h4>For Frontend:</h4>\n</div>\n<ul>\n<li><strong>Node.js</strong></li>\n<li><strong>npm</strong></li>\n</ul>\n<div dir=\"auto\">\n<h4>For Docker Deployment:</h4>\n</div>\n<ul>\n<li><strong>Docker</strong></li>\n<li><strong>Docker Compose</strong></li>\n</ul>\n<div dir=\"auto\">\n<h3>Installation</h3>\n</div>\n<div dir=\"auto\">\n<h4>Local Setup (Machine Learning Backend)</h4>\n</div>\n<ol>\n<li>\n<p dir=\"auto\">Clone the repository:</p>\n<div dir=\"auto\">\n<pre>git clone https://github.com/mariocosenza/kgsum.git\ncd kgsum</pre>\n<div>&nbsp;</div>\n</div>\n</li>\n<li>\n<p dir=\"auto\">Create and activate conda environment:</p>\n<div dir=\"auto\">\n<pre>conda env create -f environment.yml\nconda activate kgsum</pre>\n<div>&nbsp;</div>\n</div>\n</li>\n<li>\n<p dir=\"auto\"><strong>For GPU/Transformer Models (Mistral):</strong></p>\n<ul>\n<li>Comment out CUDA libraries in&nbsp;<code>environment.yml</code></li>\n<li>Change TensorFlow version to GPU-compatible version as suggested in comments</li>\n</ul>\n</li>\n</ol>\n<div dir=\"auto\">\n<h4>Frontend Setup</h4>\n</div>\n<ol>\n<li>\n<p dir=\"auto\">Install dependencies:</p>\n<div dir=\"auto\">\n<pre>npm install</pre>\n<div>&nbsp;</div>\n</div>\n</li>\n<li>\n<p dir=\"auto\">Run the frontend:</p>\n<div dir=\"auto\">\n<pre>npm run dev</pre>\n<div>&nbsp;</div>\n</div>\n</li>\n<li>\n<p dir=\"auto\"><strong>For GraphDB embedding visualization:</strong></p>\n<ul>\n<li>Replace GraphDB's&nbsp;<code>security-config.xml</code>&nbsp;with the one in&nbsp;<code>/docker/graphdb</code></li>\n</ul>\n</li>\n</ol>\n<div dir=\"auto\">\n<h3>Configuration</h3>\n</div>\n<div dir=\"auto\">\n<h4>Environment Variables</h4>\n</div>\n<p dir=\"auto\">Set the following environment variables in your shell:</p>\n<div dir=\"auto\">\n<pre>export GEMINI_API_KEY=your_gemini_api_key_here\nexport LOCAL_ENDPOINT_LOV=http://your-local-endpoint\nexport LOCAL_ENDPOINT=http://your-local-endpoint\nexport SECRET_KEY=your_secret_key_here\nexport UPLOAD_FOLDER=/path/to/uploads\nexport UPLOAD=true\nexport NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY=your_clerk_publishable_key\nexport CLASSIFICATION_API_URL=http://localhost:5000\nexport GITHUB_TOKEN=your_github_token_here</pre>\n<div>&nbsp;</div>\n</div>\n<div dir=\"auto\">\n<h4>Backend Configuration</h4>\n</div>\n<p dir=\"auto\">Configure the backend by editing&nbsp;<code>config.json</code>:</p>\n<div dir=\"auto\">\n<pre>{\n  \"labeling\" : {\n    \"use_gemini\": false,\n    \"search_zenodo\": true,\n    \"search_github\": true,\n    \"search_lod_cloud\": true,\n    \"stop_before_merging\": false\n  },\n  \"extraction\": {\n    \"start_offset\": 0,\n    \"step_numbers\": 10,\n    \"step_range\": 16,\n    \"extract_sparql\": true,\n    \"query_lov\": false\n  },\n  \"processing\" : {\n    \"use_ner\": false,\n    \"use_filter\": true\n  },\n  \"training\" : {\n     \"classifier\": \"NAIVE_BAYES\",\n     \"feature\": [\"CURI\", \"PURI\", \"LAB\", \"CON\", \"TLDS\", \"VOC\", \"LCN\", \"LPN\", \"DSC\", \"SBJ\"],\n     \"oversample\": true,\n     \"max_token\": 36000,\n     \"use_tfidf_autoencoder\": true\n  },\n  \"profile\": {\n    \"store_profile_after_training\": false,\n    \"base_domain\": \"http://www.isislab.it\"\n  },\n  \"general_settings\": {\n    \"info\": \"Possible classifiers: SVM, NAIVE_BAYES, KNN, J48, MISTRAL, MLP, DEEP, BATCHNORM, Phase: LABELING, EXTRACTION, PROCESSING, TRAINING, STORE\",\n    \"start_phase\": \"labeling\",\n    \"stop_phase\": \"training\",\n    \"allow_upload\": true\n  }\n}</pre>\n<div>&nbsp;</div>\n</div>\n<p dir=\"auto\"><strong>Available Classifiers:</strong>&nbsp;SVM, NAIVE_BAYES, KNN, J48, MISTRAL, MLP, DEEP, BATCHNORM<br><strong>Available Features:</strong>&nbsp;CURI, PURI<br><strong>Processing Phases:</strong>&nbsp;LABELING, EXTRACTION, PROCESSING, TRAINING, STORE</p>\n<p dir=\"auto\">(<a href=\"https://github.com/isislab-unisa/KGSum/tree/no-clerk#readme-top\">back to top</a>)</p>\n<div dir=\"auto\">\n<h2>Usage</h2>\n</div>\n<div dir=\"auto\">\n<h3>Training Process</h3>\n</div>\n<div dir=\"auto\">\n<h4>Full Training Pipeline</h4>\n</div>\n<p dir=\"auto\">Run the complete training process from extraction to model training:</p>\n<div dir=\"auto\">\n<pre>python train.py</pre>\n<div>&nbsp;</div>\n</div>\n<div dir=\"auto\">\n<h4>Individual Script Training</h4>\n</div>\n<p dir=\"auto\">For more fine-tuned control, run individual scripts in&nbsp;<code>/src</code>:</p>\n<div dir=\"auto\">\n<pre># Run scripts in /src directory for specific phases</pre>\n<div>&nbsp;</div>\n</div>\n<div dir=\"auto\">\n<h3>Running the Application</h3>\n</div>\n<div dir=\"auto\">\n<h4>Local Flask Server</h4>\n</div>\n<p dir=\"auto\">After completing training, start the WSGI Flask server on port 5000:</p>\n<div dir=\"auto\">\n<pre>python app.py</pre>\n<div>&nbsp;</div>\n</div>\n<div dir=\"auto\">\n<h4>Prerequisites for Complete Profiling</h4>\n</div>\n<ul>\n<li><strong>Linked Open Vocabularies (LOV) instance</strong>&nbsp;is required for complete profiling and initial data extraction</li>\n</ul>\n<div dir=\"auto\">\n<h3>API Usage</h3>\n</div>\n<p dir=\"auto\">Send POST requests to:</p>\n<ul>\n<li><code>/api/v1/profile/sparql</code></li>\n<li><code>/api/v1/profile/file</code></li>\n</ul>\n<p dir=\"auto\">Refer to the Swagger documentation for detailed request and response formats.</p>\n<p dir=\"auto\">(<a href=\"https://github.com/isislab-unisa/KGSum/tree/no-clerk#readme-top\">back to top</a>)</p>\n<div dir=\"auto\">\n<h2>Docker Deployment</h2>\n</div>\n<div dir=\"auto\">\n<h3>Quick Setup with Pre-trained Model</h3>\n</div>\n<p dir=\"auto\">For a simpler deployment using the pre-trained Naive Bayes model:</p>\n<ol>\n<li>\n<p dir=\"auto\">Navigate to the docker directory:</p>\n<div dir=\"auto\">\n<pre>cd /docker</pre>\n<div>&nbsp;</div>\n</div>\n</li>\n<li>\n<p dir=\"auto\">Fill the&nbsp;<code>.env</code>&nbsp;file with your configuration</p>\n</li>\n<li>\n<p dir=\"auto\">Run with Docker Compose:</p>\n<div dir=\"auto\">\n<pre>docker-compose up</pre>\n<div>&nbsp;</div>\n</div>\n</li>\n</ol>\n<div dir=\"auto\">\n<h3>Individual Docker Services</h3>\n</div>\n<p dir=\"auto\">Three individual Dockerfiles are provided for custom deployments:</p>\n<ul>\n<li><strong>Backend</strong>&nbsp;service</li>\n<li><strong>Frontend</strong>&nbsp;service</li>\n<li><strong>GraphDB</strong>&nbsp;configuration</li>\n</ul>\n<div dir=\"auto\">\n<h3>Hardware Requirements</h3>\n</div>\n<div dir=\"auto\">\n<h4>Tested Configuration</h4>\n</div>\n<table>\n<thead>\n<tr>\n<th>Component</th>\n<th>Specification</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>CPU</td>\n<td>AMD Ryzen 5800x</td>\n</tr>\n<tr>\n<td>RAM</td>\n<td>32 GB DDR4 3600MHz</td>\n</tr>\n<tr>\n<td>GPU</td>\n<td>NVIDIA RTX 3070</td>\n</tr>\n</tbody>\n</table>\n<div dir=\"auto\">\n<h4>Recommended Configuration</h4>\n</div>\n<table>\n<thead>\n<tr>\n<th>Component</th>\n<th>Specification</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>RAM</td>\n<td>64+ GB (larger size suggested)</td>\n</tr>\n<tr>\n<td>GPU</td>\n<td>High-performance GPU for better LLM performance</td>\n</tr>\n</tbody>\n</table>\n<p dir=\"auto\">(<a href=\"https://github.com/isislab-unisa/KGSum/tree/no-clerk#readme-top\">back to top</a>)</p>\n<div dir=\"auto\">\n<h2>Roadmap</h2>\n</div>\n<ul>\n<li>&nbsp;Add Swagger API documentation</li>\n<li>&nbsp;Expand coverage for more LLMs</li>\n<li>&nbsp;Improve Docker deployment documentation</li>\n<li>&nbsp;Add more dataset preparation examples</li>\n<li>&nbsp;Add performance optimization guides</li>\n<li>&nbsp;Enhance frontend visualization features</li>\n</ul>\n<p dir=\"auto\">See the&nbsp;<a href=\"https://github.com/mariocosenza/kgsum/issues\">open issues</a>&nbsp;for a full list of proposed features (and known issues).</p>\n<p dir=\"auto\">(<a href=\"https://github.com/isislab-unisa/KGSum/tree/no-clerk#readme-top\">back to top</a>)</p>\n<div dir=\"auto\">\n<h2>Contributing</h2>\n</div>\n<p dir=\"auto\">Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are&nbsp;<strong>greatly appreciated</strong>.</p>\n<p dir=\"auto\">If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag \"enhancement\". Don't forget to give the project a star! Thanks again!</p>\n<ol>\n<li>Fork the Project</li>\n<li>Create your Feature Branch (<code>git checkout -b feature/AmazingFeature</code>)</li>\n<li>Commit your Changes (<code>git commit -m 'Add some AmazingFeature'</code>)</li>\n<li>Push to the Branch (<code>git push origin feature/AmazingFeature</code>)</li>\n<li>Open a Pull Request</li>\n</ol>",
    "publication_date": "2025-12-02",
    "publisher": "Zenodo",
    "resource_type": {
      "id": "software",
      "title": {
        "de": "Software",
        "en": "Software"
      }
    },
    "rights": [
      {
        "description": {
          "en": "A short and simple permissive license with conditions only requiring preservation of copyright and license notices. Licensed works, modifications, and larger works may be distributed under different terms and without source code."
        },
        "id": "mit",
        "props": {
          "scheme": "spdx",
          "url": "https://opensource.org/licenses/MIT"
        },
        "title": {
          "en": "MIT License"
        }
      }
    ],
    "title": "KGSum: Automatic Classification and Profiling of Knowledge Graphs"
  },
  "parent": {
    "access": {
      "owned_by": {
        "user": "544196"
      },
      "settings": {
        "accept_conditions_text": null,
        "allow_guest_requests": false,
        "allow_user_requests": false,
        "secret_link_expiration": 0
      }
    },
    "communities": {
      "default": "d8d2099c-2f66-4ddc-9dd7-447c5214730b",
      "entries": [
        {
          "access": {
            "member_policy": "open",
            "members_visibility": "public",
            "record_submission_policy": "closed",
            "review_policy": "open",
            "visibility": "public"
          },
          "children": {
            "allow": false
          },
          "created": "2024-04-08T08:05:13.406433+00:00",
          "custom_fields": {
            "subjects": [
              {
                "id": "mesh:D003196"
              },
              {
                "id": "euroscivoc:47"
              },
              {
                "id": "mesh:D003198"
              },
              {
                "id": "euroscivoc:301"
              },
              {
                "id": "mesh:D000077488"
              }
            ]
          },
          "deletion_status": {
            "is_deleted": false,
            "status": "P"
          },
          "id": "d8d2099c-2f66-4ddc-9dd7-447c5214730b",
          "links": {},
          "metadata": {
            "description": "The research laboratory ISISLab is active since 1995 and is devoted to distributed and parallel computing, extended reality and open data",
            "organizations": [
              {
                "name": "Universit\u00e0 di Salerno"
              }
            ],
            "title": "ISISLab, Dipartimento di Informatica, Universit\u00e0 degli Studi di Salerno (Italy)",
            "website": "http://www.isislab.it"
          },
          "revision_id": 7,
          "slug": "isislab",
          "updated": "2025-04-29T17:39:53.225474+00:00"
        }
      ],
      "ids": [
        "d8d2099c-2f66-4ddc-9dd7-447c5214730b"
      ]
    },
    "id": "17795221",
    "pids": {
      "doi": {
        "client": "datacite",
        "identifier": "10.5281/zenodo.17795221",
        "provider": "datacite"
      }
    }
  },
  "pids": {
    "doi": {
      "client": "datacite",
      "identifier": "10.5281/zenodo.17795222",
      "provider": "datacite"
    },
    "oai": {
      "identifier": "oai:zenodo.org:17795222",
      "provider": "oai"
    }
  },
  "revision_id": 6,
  "stats": {
    "all_versions": {
      "data_volume": 587908368.0,
      "downloads": 44,
      "unique_downloads": 44,
      "unique_views": 143,
      "views": 157
    },
    "this_version": {
      "data_volume": 254291602.0,
      "downloads": 37,
      "unique_downloads": 37,
      "unique_views": 111,
      "views": 122
    }
  },
  "status": "published",
  "swh": {
    "swhid": "swh:1:dir:8a0cfb0dcc735518bdf75ff373e6db7a817af54a;origin=https://doi.org/10.5281/zenodo.17795221;visit=swh:1:snp:5a13070ae56290e5b1e6e1e88f68e11afd4723fa;anchor=swh:1:rel:c8f79e756603b06d799ad59a3b06ee52d7722d28;path=KGSum-no-clerk"
  },
  "updated": "2025-12-02T21:48:59.594410+00:00",
  "versions": {
    "index": 1,
    "is_latest": false
  }
}