{
  "access": {
    "embargo": {
      "active": false,
      "reason": null
    },
    "files": "public",
    "record": "public",
    "status": "open"
  },
  "created": "2026-04-21T12:55:47.696938+00:00",
  "custom_fields": {
    "code:codeRepository": "https://github.com/RobotStudyCompanion/Benchmark_LM",
    "code:developmentStatus": {
      "id": "active",
      "title": {
        "en": "Active"
      }
    },
    "code:programmingLanguage": [
      {
        "id": "python",
        "title": {
          "en": "Python"
        }
      },
      {
        "id": "shell",
        "title": {
          "en": "Shell"
        }
      }
    ],
    "meeting:meeting": {
      "acronym": "ARSO",
      "dates": "10 - 12 June, 2026",
      "place": "Vienna, Austria",
      "title": "22nd IEEE International Conference on Advanced Robotics and its Social Impact"
    }
  },
  "deletion_status": {
    "is_deleted": false,
    "status": "P"
  },
  "files": {
    "count": 1,
    "default_preview": "supplemental_materials_arso2026.zip",
    "enabled": true,
    "entries": {
      "supplemental_materials_arso2026.zip": {
        "access": {
          "hidden": false
        },
        "checksum": "md5:6a3032839b2af18e3b5c78d9974a1bda",
        "ext": "zip",
        "id": "cc342e94-f568-4a84-a4d9-181ab9242647",
        "key": "supplemental_materials_arso2026.zip",
        "links": {
          "content": "https://zenodo.org/api/records/19643021/files/supplemental_materials_arso2026.zip/content",
          "self": "https://zenodo.org/api/records/19643021/files/supplemental_materials_arso2026.zip"
        },
        "metadata": {},
        "mimetype": "application/zip",
        "size": 2102377,
        "storage_class": "L"
      }
    },
    "order": [],
    "total_bytes": 2102377
  },
  "id": "19643021",
  "is_draft": false,
  "is_published": true,
  "links": {
    "access": "https://zenodo.org/api/records/19643021/access",
    "access_grants": "https://zenodo.org/api/records/19643021/access/grants",
    "access_links": "https://zenodo.org/api/records/19643021/access/links",
    "access_request": "https://zenodo.org/api/records/19643021/access/request",
    "access_users": "https://zenodo.org/api/records/19643021/access/users",
    "archive": "https://zenodo.org/api/records/19643021/files-archive",
    "archive_media": "https://zenodo.org/api/records/19643021/media-files-archive",
    "communities": "https://zenodo.org/api/records/19643021/communities",
    "communities-suggestions": "https://zenodo.org/api/records/19643021/communities-suggestions",
    "doi": "https://doi.org/10.5281/zenodo.19643021",
    "draft": "https://zenodo.org/api/records/19643021/draft",
    "file_modification": "https://zenodo.org/api/records/19643021/file-modification",
    "files": "https://zenodo.org/api/records/19643021/files",
    "latest": "https://zenodo.org/api/records/19643021/versions/latest",
    "latest_html": "https://zenodo.org/records/19643021/latest",
    "media_files": "https://zenodo.org/api/records/19643021/media-files",
    "parent": "https://zenodo.org/api/records/19643020",
    "parent_doi": "https://doi.org/10.5281/zenodo.19643020",
    "parent_doi_html": "https://zenodo.org/doi/10.5281/zenodo.19643020",
    "parent_html": "https://zenodo.org/records/19643020",
    "preview_html": "https://zenodo.org/records/19643021?preview=1",
    "quota_increase": "https://zenodo.org/api/records/19643021/quota-increase",
    "request_deletion": "https://zenodo.org/api/records/19643021/request-deletion",
    "requests": "https://zenodo.org/api/records/19643021/requests",
    "reserve_doi": "https://zenodo.org/api/records/19643021/draft/pids/doi",
    "self": "https://zenodo.org/api/records/19643021",
    "self_doi": "https://doi.org/10.5281/zenodo.19643021",
    "self_doi_html": "https://zenodo.org/doi/10.5281/zenodo.19643021",
    "self_html": "https://zenodo.org/records/19643021",
    "self_iiif_manifest": "https://zenodo.org/api/iiif/record:19643021/manifest",
    "self_iiif_sequence": "https://zenodo.org/api/iiif/record:19643021/sequence/default",
    "versions": "https://zenodo.org/api/records/19643021/versions"
  },
  "media_files": {
    "count": 0,
    "enabled": false,
    "entries": {},
    "order": [],
    "total_bytes": 0
  },
  "metadata": {
    "creators": [
      {
        "affiliations": [
          {
            "id": "edmo:3138",
            "identifiers": [
              {
                "identifier": "edmo:3138",
                "scheme": "edmo"
              }
            ],
            "name": "University of Tartu"
          },
          {
            "name": "ECAM LaSalle"
          }
        ],
        "person_or_org": {
          "family_name": "Lamouille",
          "given_name": "Dorian",
          "name": "Lamouille, Dorian",
          "type": "personal"
        },
        "role": {
          "id": "datacollector",
          "title": {
            "de": "DatensammlerIn",
            "en": "Data collector"
          }
        }
      },
      {
        "affiliations": [
          {
            "id": "edmo:3138",
            "identifiers": [
              {
                "identifier": "edmo:3138",
                "scheme": "edmo"
              }
            ],
            "name": "University of Tartu"
          }
        ],
        "person_or_org": {
          "family_name": "Zorec",
          "given_name": "Matev\u017e Borjan",
          "identifiers": [
            {
              "identifier": "0009-0001-3334-9378",
              "scheme": "orcid"
            }
          ],
          "name": "Zorec, Matev\u017e Borjan",
          "type": "personal"
        },
        "role": {
          "id": "datacurator",
          "title": {
            "de": "DatenkuratorIn",
            "en": "Data curator"
          }
        }
      },
      {
        "affiliations": [
          {
            "id": "03z77qz90",
            "identifiers": [
              {
                "identifier": "03z77qz90",
                "scheme": "ror"
              },
              {
                "identifier": "grid.10939.32",
                "scheme": "grid"
              },
              {
                "identifier": "0000 0001 0943 7661",
                "scheme": "isni"
              }
            ],
            "name": "University of Tartu"
          }
        ],
        "person_or_org": {
          "family_name": "Baksh",
          "given_name": "Farnaz",
          "identifiers": [
            {
              "identifier": "0009-0009-8362-7696",
              "scheme": "orcid"
            }
          ],
          "name": "Baksh, Farnaz",
          "type": "personal"
        }
      },
      {
        "affiliations": [
          {
            "id": "edmo:3138",
            "identifiers": [
              {
                "identifier": "edmo:3138",
                "scheme": "edmo"
              }
            ],
            "name": "University of Tartu"
          }
        ],
        "person_or_org": {
          "family_name": "Kruusam\u00e4e",
          "given_name": "Karl",
          "identifiers": [
            {
              "identifier": "0000-0002-1720-1509",
              "scheme": "orcid"
            }
          ],
          "name": "Kruusam\u00e4e, Karl",
          "type": "personal"
        }
      }
    ],
    "description": "<p>This record accompanies the paper \"Benchmarking Local Language Models for Social Robots using Edge Devices\" [accepted IEEE ARSO 2026] and contains the raw benchmark data, MMLU scores, automated teaching-effectiveness ratings, human rater sheets, and the analysis notebook supporting the results reported therein.</p>\n<h2>Overview</h2>\n<p>We benchmarked 25 open-source language models for local deployment on edge hardware in a social-educational robotics context (the Robot Study Companion project, <a href=\"https://rsc.ee\">rsc.ee</a>). Each model was evaluated across three dimensions &mdash; inference efficiency (tokens per second, energy consumption), general knowledge (a six-category MMLU subset), and teaching effectiveness (LLM-rated pedagogical quality validated against five independent human raters) &mdash; primarily on the Raspberry Pi 4, with scalability comparisons on the Raspberry Pi 5 and a laptop NVIDIA RTX 4060 GPU.</p>\n<p>This record contains: per-query hardware telemetry across all three platforms, per-model MMLU scores, GPT-4o-mini teaching-effectiveness ratings, five human rater workbooks, and the notebook that computes inter-rater agreement statistics. Readers should consult the paper for methodology, results, and discussion; this record serves as the underlying evidentiary base.</p>\n<h2>Contents</h2>\n<pre><code>.\n\u251c\u2500\u2500 benchmarks/\n\u2502   \u251c\u2500\u2500 benchmarks_merged.csv                 727-row consolidated per-query telemetry across all platforms\n\u2502   \u251c\u2500\u2500 results_pi4/                          7 CSV files, per-query benchmarks on Raspberry Pi 4\n\u2502   \u251c\u2500\u2500 results_pi5/                          3 CSV files, per-query benchmarks on Raspberry Pi 5\n\u2502   \u2514\u2500\u2500 results_computer/                     24 CSV files, per-query benchmarks on laptop GPU<br>\u2502   \u2514\u2500\u2500 fig1_final.pdf                        Figure 1 (benchmark summary)\n\u251c\u2500\u2500 MMLU/\n\u2502   \u251c\u2500\u2500 MMLU_merged.csv                       25-row merged MMLU results (model tags harmonised with benchmarks)\n\u2502   \u2514\u2500\u2500 models_MMLU_scores/                   25 paired CSV+JSON files, per-model MMLU results\n\u2514\u2500\u2500 rated_teaching/\n    \u251c\u2500\u2500 human_ratings_merged.csv              200-row merged human rater workbook data, deblinded\n    \u251c\u2500\u2500 teaching_effectiveness_ratings/       GPT-4o-mini per-response ratings (250 rows)\n    \u2514\u2500\u2500 human_rate_gpt4o/                     5 rater workbooks + analysis\n        \u251c\u2500\u2500 annotation_analysis.ipynb         human-rater analysis (&alpha;, ICC, Pearson r, Fig 2)\n        \u2514\u2500\u2500 figure_2_human_annotation.pdf     Figure 2 (human annotation validation)</code></pre>\n<h2>Data dictionary</h2>\n<h3><code>benchmarks/results_*/benchmark_all_models_*.csv</code></h3>\n<p>Per-query records from benchmark runs. Each row captures one model answering one question, with full response text and hardware telemetry. Filenames encode the run timestamp: <code>benchmark_all_models_YYYYMMDD_HHMMSS.csv</code>.</p>\n<p>Shared columns (all platforms, 18 fields):</p>\n<table>\n<tbody>\n<tr class=\"header\">\n<th>Column</th>\n<th>Description</th>\n</tr>\n</tbody>\n<tbody>\n<tr class=\"odd\">\n<td><code>timestamp</code></td>\n<td>ISO-8601 start of inference</td>\n</tr>\n<tr class=\"even\">\n<td><code>model</code></td>\n<td>Ollama model tag (e.g. 
<code>qwen3:0.6b</code>)</td>\n</tr>\n<tr class=\"odd\">\n<td><code>model_parameters</code></td>\n<td>Nominal parameter count, as reported</td>\n</tr>\n<tr class=\"even\">\n<td><code>question</code></td>\n<td>Benchmark prompt text (see Table I in paper)</td>\n</tr>\n<tr class=\"odd\">\n<td><code>response</code></td>\n<td>Full model-generated answer (not truncated)</td>\n</tr>\n<tr class=\"even\">\n<td><code>response_length_chars</code></td>\n<td>Character count of the response</td>\n</tr>\n<tr class=\"odd\">\n<td><code>estimated_tokens</code></td>\n<td>Token count estimated from streaming chunks (see caveat)</td>\n</tr>\n<tr class=\"even\">\n<td><code>inference_time_s</code></td>\n<td>Total generation time, via <code>time.time()</code></td>\n</tr>\n<tr class=\"odd\">\n<td><code>time_to_first_token_s</code></td>\n<td>Latency to first streamed chunk, via <code>time.time()</code></td>\n</tr>\n<tr class=\"even\">\n<td><code>tokens_per_second</code></td>\n<td>Throughput</td>\n</tr>\n<tr class=\"odd\">\n<td><code>cpu_baseline_percent</code>, <code>cpu_average_percent</code></td>\n<td>CPU load (pre-inference baseline; average during inference)</td>\n</tr>\n<tr class=\"even\">\n<td><code>cpu_per_core</code></td>\n<td>Per-core utilisation, stringified Python list; parse with <code>ast.literal_eval</code></td>\n</tr>\n<tr class=\"odd\">\n<td><code>cpu_freq_mhz</code></td>\n<td>CPU frequency during inference</td>\n</tr>\n<tr class=\"even\">\n<td><code>memory_baseline_mb</code>, <code>memory_peak_mb</code>, <code>memory_increase_mb</code>, <code>memory_percent</code></td>\n<td>RAM telemetry</td>\n</tr>\n</tbody>\n</table>\n<p>Raspberry Pi-only additional columns (19 fields, present in <code>results_pi4/</code> and <code>results_pi5/</code>):</p>\n<table>\n<tbody>\n<tr class=\"header\">\n<th>Column</th>\n<th>Description</th>\n</tr>\n</tbody>\n<tbody>\n<tr class=\"odd\">\n<td><code>temperature_c</code></td>\n<td>CPU die temperature (<code>vcgencmd measure_temp</code>)</td>\n</tr>\n<tr class=\"even\">\n<td><code>throttled</code></td>\n<td>Throttling flag (<code>vcgencmd get_throttled</code>)</td>\n</tr>\n<tr class=\"odd\">\n<td><code>avg_voltage_v</code></td>\n<td>Mean rail voltage during inference (<code>vcgencmd measure_volts</code>)</td>\n</tr>\n<tr class=\"even\">\n<td><code>estimated_current_a</code></td>\n<td>Current estimate from linear CPU-load model (idle 0.6A, full-load 3A at 5V)</td>\n</tr>\n<tr class=\"odd\">\n<td><code>avg_power_watts</code></td>\n<td>V &times; A, averaged over inference</td>\n</tr>\n<tr class=\"even\">\n<td><code>total_energy_joules</code></td>\n<td>Estimated energy consumption during inference</td>\n</tr>\n<tr class=\"odd\">\n<td><code>tokens_per_joule</code></td>\n<td>Energy efficiency metric (<code>estimated_tokens / total_energy_joules</code>)</td>\n</tr>\n<tr class=\"even\">\n<td><code>io_read_count</code>, <code>io_write_count</code>, <code>io_read_bytes</code>, <code>io_write_bytes</code>, <code>io_read_time_ms</code>, <code>io_write_time_ms</code>, <code>io_total_ops</code>, <code>io_total_bytes</code>, <code>io_iops</code>, <code>io_throughput_mb_s</code>, <code>io_avg_read_latency_ms</code>, <code>io_avg_write_latency_ms</code></td>\n<td>Disk I/O telemetry</td>\n</tr>\n</tbody>\n</table>\n<p>Laptop-only additional columns (4 fields, present in <code>results_computer/</code> only):</p>\n<table>\n<tbody>\n<tr class=\"header\">\n<th>Column</th>\n<th>Description</th>\n</tr>\n</tbody>\n<tbody>\n<tr 
class=\"odd\">\n<td><code>inference_time_perf_s</code></td>\n<td>Generation time via <code>time.perf_counter()</code></td>\n</tr>\n<tr class=\"even\">\n<td><code>time_to_first_token_perf_s</code></td>\n<td>TTFT via <code>time.perf_counter()</code></td>\n</tr>\n<tr class=\"odd\">\n<td><code>tokens_per_second_perf</code></td>\n<td>Throughput computed from <code>perf_counter</code> timings</td>\n</tr>\n<tr class=\"even\">\n<td><code>timing_diff_ms</code></td>\n<td>Precision difference between <code>time.time()</code> and <code>time.perf_counter()</code></td>\n</tr>\n</tbody>\n</table>\n<h3><code>benchmarks/benchmarks_merged.csv</code></h3>\n<p>Consolidated per-query telemetry across all three platforms. 727 rows (250 Pi 4 + 237 Pi 5 + 240 laptop) &times; 42 columns (41 raw columns unioned across platforms, plus a <code>platform</code> identifier prepended). One row per (model, platform, question); the <code>qwen3:1.7b</code> laptop double-run contributes 20 rows rather than 10 (see caveats).</p>\n<p>Columns: <code>platform</code> (<code>rpi4</code> / <code>rpi5</code> / <code>laptop</code>), followed by all columns documented above. Pi-only columns fill as NaN on laptop rows; laptop-only <code>*_perf</code> columns fill as NaN on Pi rows. No values are transformed; the file is a pure union merge of the per-session CSVs with platform provenance added.</p>\n<h3><code>MMLU/models_MMLU_scores/{model}_MMLU.csv</code> and <code>.json</code></h3>\n<p>One-row-per-model aggregate MMLU results on the six-category subset used in the paper. Both formats preserved: CSV flattens per-task scores into columns (<code>score_{task}</code>); JSON preserves the native <code>task_scores</code> dict structure.</p>\n<table>\n<tbody>\n<tr class=\"header\">\n<th>Column</th>\n<th>Description</th>\n</tr>\n</tbody>\n<tbody>\n<tr class=\"odd\">\n<td><code>model_name</code></td>\n<td>Ollama model tag</td>\n</tr>\n<tr class=\"even\">\n<td><code>overall_score</code></td>\n<td>Mean accuracy across the six categories (0&ndash;1 scale)</td>\n</tr>\n<tr class=\"odd\">\n<td><code>tasks</code></td>\n<td>Comma-separated task list</td>\n</tr>\n<tr class=\"even\">\n<td><code>num_tasks</code></td>\n<td>Task count (always 6)</td>\n</tr>\n<tr class=\"odd\">\n<td><code>n_shots</code></td>\n<td>Prompting shots (always 3)</td>\n</tr>\n<tr class=\"even\">\n<td><code>timestamp</code></td>\n<td>Run timestamp</td>\n</tr>\n<tr class=\"odd\">\n<td><code>status</code></td>\n<td><code>success</code> or failure marker</td>\n</tr>\n<tr class=\"even\">\n<td>\n<p><code>score_formal_logic</code>, <code>score_global_facts</code>,&nbsp;<code>score_college_computer_science</code>, <code>score_college_mathematics</code>, <code>score_marketing</code>, <code>score_high_school_macroeconomics</code></p>\n</td>\n<td>Per-task accuracy (0&ndash;1)</td>\n</tr>\n</tbody>\n</table>\n<h3><code>MMLU/MMLU_merged.csv</code></h3>\n<p>25-row summary aggregating the per-model files above; underlies Table II's MMLU column and Table III. Columns:&nbsp;<code>model</code>, <code>overall_score</code>, <code>n_shots</code>, <code>status</code>, <code>timestamp</code>, and the six per-task <code>score_*</code> columns. 
Model tags harmonised with the benchmarks dataset &mdash; the upstream per-model file <code>nemotron-mini_MMLU.csv</code> appears here as <code>nemotron-mini:4b</code> to match the Ollama canonical form used elsewhere in the record.</p>\n<h3><code>rated_teaching/teaching_effectiveness_ratings/teaching_effectiveness_ratings.csv</code></h3>\n<p>Per-response GPT-4o-mini ratings (250 rows = 25 models &times; 10 questions). Ratings produced via the OpenAI API; the rating prompt appears in our paper under &sect;III-B.</p>\n<table>\n<tbody>\n<tr class=\"header\">\n<th>Column</th>\n<th>Description</th>\n</tr>\n</tbody>\n<tbody>\n<tr class=\"odd\">\n<td><code>model</code>, <code>model_parameters</code></td>\n<td>Model identifiers</td>\n</tr>\n<tr class=\"even\">\n<td><code>question</code></td>\n<td>Benchmark prompt</td>\n</tr>\n<tr class=\"odd\">\n<td><code>response_preview</code></td>\n<td>First 200 characters of the model response (full text in the benchmark CSVs)</td>\n</tr>\n<tr class=\"even\">\n<td><code>score</code></td>\n<td>Teaching-effectiveness rating, 1&ndash;10 scale</td>\n</tr>\n<tr class=\"odd\">\n<td><code>strengths</code>, <code>weaknesses</code></td>\n<td>Stringified lists of rater-identified strengths and weaknesses</td>\n</tr>\n<tr class=\"even\">\n<td><code>justification</code></td>\n<td>One- or two-sentence rationale</td>\n</tr>\n<tr class=\"odd\">\n<td><code>tokens_per_second</code>, <code>inference_time_s</code></td>\n<td>Carried over from benchmark run for joint analysis</td>\n</tr>\n<tr class=\"even\">\n<td><code>error</code></td>\n<td>Populated when rating JSON failed to parse (score then defaults to 0)</td>\n</tr>\n</tbody>\n</table>\n<h3><code>rated_teaching/human_rate_gpt4o/teaching_eval_{pseudonym}.xlsx</code></h3>\n<p>One workbook per rater; rater pseudonyms: <code>bird</code>, <code>duck</code>, <code>sky</code>, <code>squirrel</code>, <code>tree</code>. Each workbook contains three sheets:</p>\n<ul>\n<li><strong>Instructions</strong> &mdash; participant information, ethics statement, consent block</li>\n<li><strong>Responses</strong> &mdash; 40 rows (4 models &times; 10 questions) &times; 8 teaching-quality criteria on a 1&ndash;10 scale, plus an optional Comments column</li>\n<li><strong>GPT Scores (DO NOT VIEW)</strong> &mdash; GPT-4o-mini scores for the same 40 responses, intended to be consulted only after annotation</li>\n</ul>\n<p>Raters received model responses blinded via A/B/C/D labels (A=Gemma3 0.27B, B=Gemma3 1B, C=Granite4 Tiny Hybrid 7B, D=Mistral 7B). No personally identifiable data was retained; the contact email in the consent block is the principal contact above.</p>\n<h3><code>rated_teaching/human_ratings_merged.csv</code></h3>\n<p>200-row merged dataset aggregating all five rater workbooks (5 raters &times; 4 models &times; 10 questions). One row per (rater, model, question). 
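</p>\n<p>In the spirit of the shipped notebook (a sketch only; it assumes the third-party <code>krippendorff</code> package, and <code>annotation_analysis.ipynb</code> remains the authoritative computation), overall agreement on <code>mean_score</code> can be estimated from the columns documented below:</p>\n<pre><code>import pandas as pd\nimport krippendorff  # pip install krippendorff (third-party package)\n\ndf = pd.read_csv('rated_teaching/human_ratings_merged.csv')\n# reliability matrix: one row per rater, one column per (model, question) unit\nmatrix = df.pivot_table(index='rater', columns=['model', 'question_num'],\n                        values='mean_score')\nalpha = krippendorff.alpha(reliability_data=matrix.to_numpy(),\n                           level_of_measurement='interval')\nprint(f\"Krippendorff's alpha (interval): {alpha:.3f}\")</code></pre>\n<p>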
Models are deblinded to their canonical Ollama tags; the A/B/C/D labels visible to raters are dropped.</p>\n<table>\n<tbody>\n<tr class=\"header\">\n<th>Column</th>\n<th>Description</th>\n</tr>\n</tbody>\n<tbody>\n<tr class=\"odd\">\n<td><code>rater</code></td>\n<td>Rater pseudonym: <code>bird</code>, <code>duck</code>, <code>sky</code>, <code>squirrel</code>, or <code>tree</code></td>\n</tr>\n<tr class=\"even\">\n<td><code>model</code></td>\n<td>Ollama model tag (deblinded from A/B/C/D)</td>\n</tr>\n<tr class=\"odd\">\n<td><code>question_num</code></td>\n<td>Question index (1&ndash;10)</td>\n</tr>\n<tr class=\"even\">\n<td><code>question</code></td>\n<td>Benchmark prompt (full text)</td>\n</tr>\n<tr class=\"odd\">\n<td><code>response_preview</code></td>\n<td>Truncated model response as shown to the rater</td>\n</tr>\n<tr class=\"even\">\n<td><code>clarity</code>, <code>accuracy</code>, <code>engagement</code>, <code>structure</code>, <code>completeness</code>, <code>appropriate_level</code>, <code>examples_analogies</code>, <code>actionable</code></td>\n<td>Per-criterion rating, 1&ndash;10 scale</td>\n</tr>\n<tr class=\"odd\">\n<td><code>mean_score</code></td>\n<td>Arithmetic mean across the eight criteria</td>\n</tr>\n<tr class=\"even\">\n<td><code>comments</code></td>\n<td>Rater's free-text comment, if any (otherwise NaN)</td>\n</tr>\n</tbody>\n</table>\n<h3><code>annotation_analysis.ipynb</code></h3>\n<p>Jupyter notebook computing the statistics reported in paper &sect;IV-E: Krippendorff's &alpha;, ICC(C,1) and ICC(C,k), Pearson r, mean absolute difference. Generates Figure 2.</p>\n<h2>Methodology</h2>\n<table>\n<tbody>\n<tr>\n<th><strong>Metric</strong></th>\n<th><strong>Description</strong></th>\n<th><strong>Metric type</strong></th>\n</tr>\n<tr>\n<td>Tokens Per Second (TPS)</td>\n<td>Number of tokens generated by the model in one second (computed as the total number of tokens divided by the total generation time).</td>\n<td>Hardware</td>\n</tr>\n<tr>\n<td>Inference time</td>\n<td>Total time spent generating one output (time between the query being input to the model and the last generated token).</td>\n<td>Hardware</td>\n</tr>\n<tr>\n<td>Time To First Token (TTFT)</td>\n<td>Time to generate the first token of the output.</td>\n<td>Hardware</td>\n</tr>\n<tr>\n<td>Response length</td>\n<td>Total number of characters in an output.</td>\n<td>Hardware</td>\n</tr>\n<tr>\n<td>IOPS</td>\n<td>Disk input/output operations per second.</td>\n<td>Hardware</td>\n</tr>\n<tr>\n<td>Tokens Per Joule (TPJ)</td>\n<td>Number of tokens generated by the model per joule of energy consumed.</td>\n<td>Hardware</td>\n</tr>\n<tr>\n<td>\n<p>Massive Multitask Language Understanding (MMLU)</p>\n</td>\n<td>Benchmark used to assess the general knowledge of a model across multiple topics.</td>\n<td>Accuracy</td>\n</tr>\n<tr>\n<td>\n<p>Teaching effectiveness</p>\n</td>\n<td>Rating of the output based on teaching criteria on a scale from 1 to 10 (judged by a larger LLM: GPT-4o-mini).</td>\n<td>Accuracy</td>\n</tr>\n<tr>\n<td>Human rating</td>\n<td>Rating of the output based on the same eight teaching criteria on a 1&ndash;10 scale (judged by five independent human raters on a four-model subset; 200 annotations total).</td>\n<td>Accuracy</td>\n</tr>\n</tbody>\n</table>\n<p>For full methodology, please consult paper &sect;III. 
In brief:</p>\n<ul>\n<li><strong>MMLU subset</strong> covers Formal Logic, Global Facts, College Computer Science, College Mathematics, Marketing, and High School Macroeconomics (1,050 questions total). DeepEval [16] orchestrates 3-shot prompting at temperature 0.1. The evaluation runs on an NVIDIA A100 (40GB) via University of Tartu HPC, since MMLU accuracy is hardware-agnostic.</li>\n<li><strong>Inference benchmarks</strong> use Ollama with streaming enabled on the target platform. Ten pedagogical questions per model (Table I in paper) cover explanatory depth, adaptability, misconception handling, and student guidance. Models above 1.4B parameters receive a structured system prompt; models below receive the raw question only, owing to sensitivity to prompt formatting at small parameter counts.</li>\n<li><strong>Teaching-effectiveness ratings</strong> use GPT-4o-mini against eight criteria (clarity, accuracy, engagement, structure, completeness, appropriate level, examples/analogies, actionable). The rating prompt appears verbatim in paper &sect;III-B.</li>\n<li><strong>Human validation</strong> (paper &sect;IV-E) covers four representative models &times; 10 questions &times; 5 raters = 200 annotations across the same eight criteria.</li>\n</ul>\n<h2>Known caveats and divergences</h2>\n<ul>\n<li><strong>Raspberry Pi 5 scope.</strong> Paper &sect;III-D reports scalability on <em>three</em> Raspberry Pi 5 models (qwen3:0.6b, gemma3:1b, granite4:tiny-h), selected on TPS/MMLU/size criteria from the RPi 4 results for Robot Study Companion architecture planning. This record contains Pi 5 data for <strong>24 of the 25 models</strong> across three sessions (2025-12-01, 2025-12-02, 2025-12-10). <code>granite4:1b-h</code> is the sole model not covered on the Pi 5. Users may analyse the broader Pi 5 dataset; the paper's three-model subset remains the analytical focus cited therein.</li>\n<li><strong>Teaching-rating parse failures.</strong> Three rows in <code>teaching_effectiveness_ratings.csv</code> carry <code>score=0</code> and <code>error=\"Failed to parse rating\"</code>: nemotron-mini:4b (question 9), phi4-mini-reasoning:3.8b (question 2), tinyllama:latest (question 3). Their <code>response_preview</code> fields contain the model output that GPT-4o-mini could not parse into the expected JSON rubric. These three are excluded from the per-model teaching-effectiveness summary used in paper Table II.</li>\n<li><strong>Nemotron-mini MMLU format violations.</strong> The <code>nemotron-mini_MMLU.csv</code> file records a 0% aggregate due to repeated output-format violations; see Table II's &Dagger; footnote. The file is retained for completeness and reproducibility of the failure mode.</li>\n<li><strong>Granite4 1B quantisation note.</strong> Paper &sect;IV-B flags that <code>granite4:1b</code> ships in BF16 (not Q4_K_M), yielding an atypical 3.3 GB on-disk footprint and 0.89 TPS &mdash; values not directly comparable to its Q4-quantised peers.</li>\n<li><strong>Single-run hardware metrics.</strong> Runtime constraints on the RPi 4 precluded multiple independent runs per model. All hardware metrics (TPS, TPJ, inference time, etc.) are single-run values; thermal drift may introduce unquantified variance into absolute values while preserving relative rankings.</li>\n<li><strong>Token counting.</strong> TPS and related metrics use streamed-chunk counts rather than tokeniser-level token counts. 
This may introduce minor discrepancies across runtimes but does not affect relative comparisons within the benchmark.</li>\n<li><strong>Power estimation uncertainty.</strong> Absolute tokens-per-joule values carry an estimated &plusmn;15&ndash;20% uncertainty owing to the linear CPU-load-to-current approximation (idle 0.6A, full-load 3A at 5V nominal). Relative rankings remain stable.</li>\n<li><strong>Reasoning-model throughput.</strong> DeepSeek-R1 (1.5B, 7B) and Phi4-mini-reasoning (3.8B) emit internal chain-of-thought tokens in the runtime stream. Their throughput and latency metrics may appear optimistic relative to the useful output delivered.</li>\n<li><strong>Laptop coverage.</strong> The laptop dataset covers 23 of the 25 models; <code>granite4:1b-h</code> and <code>granite4:3b-h</code> were not run on the laptop. In sum, full-platform coverage is: Pi 4 all 25, Pi 5 24 (missing <code>granite4:1b-h</code>), laptop 23 (missing both hybrid variants).</li>\n<li><strong>qwen3:1.7b double-run on laptop.</strong> <code>qwen3:1.7b</code> was run twice on the laptop (sessions 2025-11-19 15:26 and 15:29); both runs are preserved in <code>results_computer/</code> and in <code>benchmarks_merged.csv</code>. Rows are distinguished by <code>timestamp</code>. All other (model, platform) pairs have a single run.</li>\n<li><strong>falcon3:3b degenerate generations on Pi 4.</strong> falcon3:3b on Pi 4 produced a one-character response to question 10 (\"I am strugulling with C++ where should i start?\") and a zero-character response to question 8 (\"what are the steps to write a master thesis?\"). Both rows have NaN power and energy metrics (<code>vcgencmd</code> sampler did not fire during these runs). The question-8 row carries <code>tokens_per_second</code> &asymp; 0.2, depressing falcon3:3b's Pi 4 throughput aggregate; exclude via <code>response_length_chars &gt; 0</code> when computing per-model means.</li>\n<li><strong>GPT-4o-mini hallucinated scores for degenerate responses.</strong> Following on from the falcon3:3b Pi 4 runs above, GPT-4o-mini rated question 8 at 7/10 and question 10 at 6/10 with no error flags and empty or near-empty <code>response_preview</code> fields. These rows inflate falcon3:3b's teaching aggregate; exclude via <code>response_length_chars &gt; 0</code> if joining with benchmarks for analysis. Combined with the three parse-failures above, the total known GPT-rating anomalies stand at five.</li>\n</ul>\n<h2>Reproducibility</h2>\n<ul>\n<li><strong>Benchmark scripts.</strong> Available at <a href=\"https://github.com/RobotStudyCompanion/Benchmark_LM/releases/tag/v0.1\">https://github.com/RobotStudyCompanion/Benchmark_LM/releases/tag/v0.1</a></li>\n<li><strong>Analysis notebook.</strong> <code>rated_teaching/human_rate_gpt4o/annotation_analysis.ipynb</code> inside the supplemental archive.</li>\n<li><strong>Hardware.</strong> Raspberry Pi 4 Model B (8GB), Raspberry Pi 5 (8GB), and a laptop with NVIDIA RTX 4060 GPU. Raspberry Pi OS Lite (64-bit, 2025-11-24 release, kernel 6.12, Debian 12 bookworm). MMLU evaluation performed on NVIDIA A100 (40GB) via University of Tartu HPC services.</li>\n</ul>",
    "funding": [
      {
        "award": {
          "number": "PRG3237"
        },
        "funder": {
          "id": "00jjeja18",
          "name": "Estonian Research Council"
        }
      }
    ],
    "publication_date": "2026-04-18",
    "publisher": "Zenodo",
    "related_identifiers": [
      {
        "identifier": "https://github.com/RobotStudyCompanion/Benchmark_LM/releases/tag/v0.1",
        "relation_type": {
          "id": "isdocumentedby",
          "title": {
            "de": "Wird dokumentiert von",
            "en": "Is documented by"
          }
        },
        "resource_type": {
          "id": "software",
          "title": {
            "de": "Software",
            "en": "Software"
          }
        },
        "scheme": "other"
      }
    ],
    "resource_type": {
      "id": "dataset",
      "title": {
        "de": "Datensatz",
        "en": "Dataset"
      }
    },
    "rights": [
      {
        "description": {
          "en": "The Creative Commons Attribution license allows re-distribution and re-use of a licensed work on the condition that the creator is appropriately credited."
        },
        "icon": "cc-by-icon",
        "id": "cc-by-4.0",
        "props": {
          "scheme": "spdx",
          "url": "https://creativecommons.org/licenses/by/4.0/legalcode"
        },
        "title": {
          "en": "Creative Commons Attribution 4.0 International"
        }
      }
    ],
    "title": "Supplemental materials to \"Benchmarking Local Language Models for Social Robots using Edge Devices\""
  },
  "parent": {
    "access": {
      "owned_by": {
        "user": "1430337"
      },
      "settings": {
        "accept_conditions_text": null,
        "allow_guest_requests": false,
        "allow_user_requests": false,
        "secret_link_expiration": 0
      }
    },
    "communities": {
      "default": "4a41b11b-322a-4d30-b777-7b7ab3a3c527",
      "entries": [
        {
          "access": {
            "member_policy": "closed",
            "members_visibility": "public",
            "record_submission_policy": "open",
            "review_policy": "closed",
            "visibility": "public"
          },
          "children": {
            "allow": false
          },
          "created": "2026-04-17T12:40:55.954598+00:00",
          "custom_fields": {
            "subjects": [
              {
                "id": "mesh:D000083505"
              },
              {
                "id": "euroscivoc:1233"
              },
              {
                "id": "euroscivoc:297"
              },
              {
                "id": "euroscivoc:933"
              },
              {
                "id": "euroscivoc:939"
              },
              {
                "id": "mesh:D018961Q000193"
              },
              {
                "id": "mesh:D004493"
              },
              {
                "id": "mesh:D018907"
              },
              {
                "id": "mesh:D003491"
              }
            ]
          },
          "deletion_status": {
            "is_deleted": false,
            "status": "P"
          },
          "id": "4a41b11b-322a-4d30-b777-7b7ab3a3c527",
          "links": {},
          "metadata": {
            "curation_policy": "<h1>Curation policy</h1>\n<p>Members of the RSC team curate&nbsp;voluntarily; review timelines depend on individual availability.</p>\n<p>We aim to respond to submissions within two weeks.</p>\n<p><strong>Review criteria</strong></p>\n<p>We check each submission against our <a title=\"About/Submission policy\" href=\"https://zenodo.org/communities/rsc/about\" target=\"_blank\" rel=\"noopener\">Submission policy</a>. Curators verify:</p>\n<ul>\n<li>Scope alignment with RSC research and development</li>\n<li>Absence of personal or identifying information</li>\n<li>Documented ethical clearance where applicable</li>\n<li>Appropriate open licensing</li>\n<li>Metadata completeness and reproducibility support</li>\n</ul>\n<p><strong>Decisions</strong></p>\n<p>Submissions are accepted, returned for revision with actionable feedback, or declined if out of scope. Where identifying information is present, curators will request redaction before acceptance rather than decline outright.</p>\n<p><strong>Conflicts of interest</strong></p>\n<p>Where a curator has co-authored a submission, review is handled by another curator where possible.</p>\n<p><strong>Appeals</strong></p>\n<p>Depositors who disagree with a curation decision may request a second review by contacting the community via email at <a href=\"mailto:robotstudycompanion@gmail.com\">robotstudycompanion@gmail[dot]com</a></p>",
            "description": "The Robot[ic] Study Companion (RSC) is an open-source social-educational robot platform. This community curates related research artefacts, including peer-reviewed papers, datasets, code releases, theses, and supplementary materials.",
            "organizations": [
              {
                "id": "edmo:3138"
              },
              {
                "id": "05d8pm274"
              }
            ],
            "page": "<h1>About the Robot[ic] Study Companion community</h1>\n<p>The Robot[ic] Study Companion (RSC) is an open-source social-educational robot platform. The project explores how low-cost, privacy-preserving, and reproducible robotic companions can support university students' learning experiences.</p>\n<p>This Zenodo community archives research-grade artefacts produced by the RSC team and its collaborators. We aim to make the platform's scholarly output, such as papers, datasets, code, theses, and supplementary materials, more discoverable, citable, and openly accessible in one place.</p>\n<h3>Curators</h3>\n<ul>\n<li><strong>Farnaz Baksh</strong>, project lead and originator of the RSC concept. Doctoral researcher investigating social robot study companions. <a href=\"https://orcid.org/0009-0009-8362-769\" target=\"_blank\" rel=\"noopener\">orcid.org/0009-0009-8362-769</a></li>\n<li><strong>Matev\u017e Borjan Zorec</strong>, curator and community manager. Lead developer of the RSC platform. <a href=\"https://orcid.org/0009-0001-3334-9378\" target=\"_blank\" rel=\"noopener\">orcid.org/0009-0001-3334-9378</a></li>\n</ul>\n<h1><strong>Submission policy</strong></h1>\n<p>We accept research-grade artefacts directly supporting RSC research and development: peer-reviewed papers and preprints, datasets, code and software releases, theses, and supplementary materials tied to a clear publication or research trajectory. We do not host informal content such as slide decks, blog posts, working notes, or drafts without a defined research outcome.</p>\n<p><strong>Before you submit</strong></p>\n<p>Please ensure your deposit meets the following requirements. Submissions that do not meet them will be returned with feedback.</p>\n<ol>\n<li><strong>No personal or identifying information.</strong> Remove names, email addresses, institutional IDs, IP addresses, device identifiers, and any free-text fields that could identify participants, raters, or third parties. Pseudonymise where identifiers are essential to the data structure.</li>\n<li><strong>Ethical clearance documented.</strong> Where your work involves human participants, state the approving body and reference number in the record description.</li>\n<li><strong>Licence specified.</strong> Choose an open licence appropriate to the artefact (CC-BY 4.0 is typical for papers and datasets; MIT, Apache-2.0, or GPL for code).</li>\n<li><strong>Metadata complete.</strong> Include authors with ORCIDs where available, a clear title, a descriptive abstract, and relevant keywords.</li>\n<li><strong>Reproducibility supported.</strong> For datasets and code, include a description or README document clearly and concisely describing structure, dependencies, and guidance for reproducing reported results.</li>\n<li><strong>RSC linkage stated.</strong> Briefly note how the deposit relates to the RSC platform or research programme.</li>\n</ol>\n<h2><strong>How to submit</strong></h2>\n<p>Upload your record to Zenodo and, on the submission form, select \"Robot[ic] Study Companion\" from the Communities dropdown. Curators review submissions on a voluntary basis, typically within two weeks depending on availability.</p>\n<p><strong>Review outcomes</strong></p>\n<p>Submissions are accepted, returned for revision with specific feedback, or declined if out of scope. Curators may request redaction of identifying information as a condition of acceptance. 
Declined submissions may be resubmitted once the issues are addressed.</p>\n<h1>Citing the community</h1>\n<p>Each record in this community has its own DOI and should be cited individually. If you additionally wish to acknowledge the RSC platform itself, for example, when your work builds on the RSC or you deposit into this community, we suggest the following.</p>\n<p><strong>Acknowledgement line</strong></p>\n<blockquote>\n<p>Artefact deposited in the Robot[ic] Study Companion community on Zenodo (<a href=\"https://zenodo.org/communities/rsc\">https://zenodo.org/communities/rsc</a>).</p>\n</blockquote>\n<p><strong>Canonical RSC reference</strong></p>\n<blockquote>\n<p>Baksh, F., Zorec, M. B., &amp; Kruusam&auml;e, K. (2024). Open-Source Robotic Study Companion with Multimodal Human&ndash;Robot Interaction to Improve the Learning Experience of University Students. <em>Applied Sciences</em>, 14(13).</p>\n</blockquote>\n<p>We recommend citing this publication when referencing the RSC platform in publications and derived works.</p>\n<h2>Contact</h2>\n<p>For questions about the community or to discuss a potential contribution, visit <a href=\"https://rsc.ee/\">rsc.ee</a> and fill out <a href=\"https://forms.gle/WJYp9mDta3YBHC5Z7\" target=\"_blank\" rel=\"noopener noreferrer\">this quick form</a> \ud83d\udcc4 or email us at <a href=\"mailto:robotstudycompanion@gmail.com\">robotstudycompanion@gmail.com</a>.</p>",
            "title": "Robot[ic] Study Companion",
            "type": {
              "id": "project"
            },
            "website": "https://rsc.ee/"
          },
          "revision_id": 16,
          "slug": "rsc",
          "updated": "2026-04-17T14:55:26.698529+00:00"
        }
      ],
      "ids": [
        "4a41b11b-322a-4d30-b777-7b7ab3a3c527"
      ]
    },
    "id": "19643020",
    "pids": {
      "doi": {
        "client": "datacite",
        "identifier": "10.5281/zenodo.19643020",
        "provider": "datacite"
      }
    }
  },
  "pids": {
    "doi": {
      "client": "datacite",
      "identifier": "10.5281/zenodo.19643021",
      "provider": "datacite"
    },
    "oai": {
      "identifier": "oai:zenodo.org:19643021",
      "provider": "oai"
    }
  },
  "revision_id": 14,
  "stats": {
    "all_versions": {
      "data_volume": 16819016.0,
      "downloads": 8,
      "unique_downloads": 7,
      "unique_views": 53,
      "views": 77
    },
    "this_version": {
      "data_volume": 16819016.0,
      "downloads": 8,
      "unique_downloads": 7,
      "unique_views": 53,
      "views": 77
    }
  },
  "status": "published",
  "swh": {},
  "updated": "2026-04-21T14:00:28.349920+00:00",
  "versions": {
    "index": 1,
    "is_latest": true
  }
}