Conference paper Open Access

Using weakly aligned score–audio pairs to train deep chroma models for cross-modal music retrieval

Frank Zalkow; Meinard Müller

JSON Export

  "files": [
      "links": {
        "self": ""
      "checksum": "md5:2cbe5c2a931796e4c4ec43019e66e339", 
      "bucket": "9f865a3a-a51e-4dc4-a391-5f9a566dce22", 
      "key": "23.pdf", 
      "type": "pdf", 
      "size": 831590
  "owners": [
  "doi": "10.5281/zenodo.4245400", 
  "stats": {
    "version_unique_downloads": 45.0, 
    "unique_views": 115.0, 
    "views": 128.0, 
    "version_views": 128.0, 
    "unique_downloads": 45.0, 
    "version_unique_views": 115.0, 
    "volume": 42411090.0, 
    "version_downloads": 51.0, 
    "downloads": 51.0, 
    "version_volume": 42411090.0
  "links": {
    "doi": "", 
    "conceptdoi": "", 
    "bucket": "", 
    "conceptbadge": "", 
    "html": "", 
    "latest_html": "", 
    "badge": "", 
    "latest": ""
  "conceptdoi": "10.5281/zenodo.4245399", 
  "created": "2020-11-05T01:04:11.779081+00:00", 
  "updated": "2020-11-06T00:27:02.373625+00:00", 
  "conceptrecid": "4245399", 
  "revision": 3, 
  "id": 4245400, 
  "metadata": {
    "access_right_category": "success", 
    "part_of": {
      "pages": "184-191", 
      "title": "Proceedings of the 21st International Society for Music Information Retrieval Conference"
    "doi": "10.5281/zenodo.4245400", 
    "description": "Many music information retrieval tasks involve the comparison of a symbolic score representation with an audio recording. A typical strategy is to compare score\u2013audio pairs based on a common mid-level representation, such as chroma features. Several recent studies demonstrated the effectiveness of deep learning models that learn task-specific mid-level representations from temporally aligned training pairs. However, in practice, there is often a lack of strongly aligned training data, in particular for real-world scenarios. In our study, we use weakly aligned score\u2013audio pairs for training, where only the beginning and end of a score excerpt is annotated in an audio recording, without aligned correspondences in between. To exploit such weakly aligned data, we employ the Connectionist Temporal Classification (CTC) loss to train a deep learning model for computing an enhanced chroma representation. We then apply this model to a cross-modal retrieval task, where we aim at finding relevant audio recordings of Western classical music, given a short monophonic musical theme in symbolic notation as a query. We present systematic experiments that show the effectiveness of the CTC-based model for this theme-based retrieval task.", 
    "license": {
      "id": "CC-BY-4.0"
    "title": "Using weakly aligned score\u2013audio pairs to train deep chroma models for cross-modal music retrieval", 
    "relations": {
      "version": [
          "count": 1, 
          "index": 0, 
          "parent": {
            "pid_type": "recid", 
            "pid_value": "4245399"
          "is_last": true, 
          "last_child": {
            "pid_type": "recid", 
            "pid_value": "4245400"
    "imprint": {
      "publisher": "ISMIR", 
      "place": "Montreal, Canada"
    "communities": [
        "id": "ismir"
    "publication_date": "2020-10-11", 
    "creators": [
        "name": "Frank Zalkow"
        "name": "Meinard M\u00fcller"
    "meeting": {
      "acronym": "ISMIR 2020", 
      "url": "", 
      "dates": "October 11-16, 2020", 
      "place": "Montreal, Canada", 
      "title": "International Society for Music Information Retrieval Conference"
    "access_right": "open", 
    "resource_type": {
      "subtype": "conferencepaper", 
      "type": "publication", 
      "title": "Conference paper"
    "related_identifiers": [
        "scheme": "doi", 
        "identifier": "10.5281/zenodo.4245399", 
        "relation": "isVersionOf"
| | All versions | This version |
|---|---|---|
| Views | 128 | 128 |
| Downloads | 51 | 51 |
| Data volume | 42.4 MB | 42.4 MB |
| Unique views | 115 | 115 |
| Unique downloads | 45 | 45 |


Cite as