{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ALEX LGBl csv export to BibTeX\n",
"\n",
"This notebook converts a CSV file to a BibTeX file and was made for the mrp-bib Zotero library (https://www.zotero.org/groups/2042149/mrp-bib/library). \n",
"\n",
"\n",
"Assumption - CSV file has the following column names: \n",
"\n",
"```\n",
"SELECT \n",
"\tid,\t\t\t# lfnr, decimal(23,0)\n",
"\taid,\t\t# kürzel, varchar(3)\n",
"\tjahr,\t\t# jahr, varchar(4)\n",
"\tausdatvf,\t# ausgabedatum als text, varchar(255)\n",
"\tausdatsf,\t# ausgabedatum als yyyymmdd, varchar(8)\n",
"\tgesdatvf,\t# gesetzesdatum als text, varchar(255)\n",
"\tgesdatsf,\t# gesetzesdatum als yyyymmdd, varchar(8)\n",
"\twirdatvf,\t# wirkungsdatum als text, varchar(255)\n",
"\twirdatsf,\t# wirkungsdatum als yyyymmdd, varchar(8)\n",
"\tgestyp,\t\t# typ [verordnung, gesetz,....], varchar(100)\n",
"\tinhalt,\t\t# Gesetz, longtext(65535)\n",
"\tstueck,\t\t# Stück, varchar(50)\n",
"\tstuecknr,\t# Stück als Zahl decimal(22,0), \n",
"\tgesnr,\t\t# GesetzesNr, decimal(22,0)\n",
"\tseite,\t\t# SeitenNr, decimal(22,0)\t\t==> URL:seite=%08d, seite. \n",
"\tgeschzahl, # ?, varchar(100)\n",
"\tabtnr,\t\t# ?, decimal(22,0)\n",
"\tinst,\t\t# ?, longtext(65535)\n",
"\tordner,\t\t# der physikalische unterordner ==> URL:datum=%d%d, jahr, substr(ordner,9,8)\n",
"\tsprache,\t# Sprache [deu, ita, ...], char(3)\n",
"\tschrift,\t# Schrift [L=Latein, K=Kyrillisch], char(1)\n",
"\tgid,\t\t# Gültigkeit [B, H, R], char(1)\n",
"\tbezeichnung,# Bezeichnung, varchar(100)\n",
"\tkurzform, \t# Kurzbezeichnung, varchar(100)\n",
"\tbool_exists\t# es gibt gesetze, da haben wir nicht die entsprechenden seiten, nur das iv.\n",
"FROM annoiv.gesamt_t t\n",
"where gid in ('H','R') \n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import re"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Open and read the export file (csv). "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the tab-separated export into a list of dicts keyed by the header row.\n",
"# The with-block closes the file even if parsing fails, and newline='' is what\n",
"# the csv module asks for so that line breaks inside quoted fields survive.\n",
"with open('alex20210323/alex_20210323.csv', encoding='utf-8', newline='') as input_file:\n",
"    input_csv_file = csv.DictReader(input_file, delimiter='\\t')\n",
"    data = []\n",
"    # An explicit loop (rather than list(...)) keeps `row` bound to the last\n",
"    # record after the loop ends, as it was before.\n",
"    for row in input_csv_file:\n",
"        data.append(row)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In case of a KeyError please check the header entries (dict keys) and correct if necessary (e.g. 'Key' or '\\ufeff \"Key\"')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `row` is the loop variable left over from the cell that reads the CSV, i.e.\n",
"# the last record read; its keys show the header names DictReader picked up.\n",
"display(row) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Spot-check one record by position; this raises IndexError when the export\n",
"# holds fewer than 125576 records.\n",
"display(data[125575])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define functions to write attributes (add new if necessary)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def title(row):\n",
"    \"\"\"Return the BibTeX ``title`` line for ``row``.\n",
"\n",
"    The value comes from ``row['inhalt']``; an empty string is returned when\n",
"    that field is None or ''.\n",
"    \"\"\"\n",
"    content = row['inhalt']\n",
"    if content in (None, ''):\n",
"        return ''\n",
"    return '\\ttitle = {' + content + '},\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def year(row):\n",
"    \"\"\"Return the BibTeX ``date`` line (YYYY-MM-DD) built from ``ausdatsf``.\n",
"\n",
"    Returns '' when ``ausdatsf`` is None, '' or the literal \"NULL\".\n",
"    \"\"\"\n",
"    stamp = row['ausdatsf']\n",
"    if stamp in (None, '', 'NULL'):\n",
"        return ''\n",
"    # ausdatsf is sliced as yyyymmdd: year, month, day.\n",
"    parts = (stamp[0:4], stamp[4:6], stamp[6:8])\n",
"    return '\\tdate = {' + '-'.join(parts) + '},\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def pages(row):\n",
"    \"\"\"Return the BibTeX ``pages`` line from ``seite``, or '' if it is None or ''.\"\"\"\n",
"    page = row['seite']\n",
"    return '' if page in (None, '') else '\\tpages = {' + page + '},\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def number(row):\n",
"    \"\"\"Return the BibTeX ``number`` line in the form ``gesnr/year``.\n",
"\n",
"    The year is the first four characters of ``ausdatsf`` (yyyymmdd).\n",
"    Returns '' when ``gesnr`` is None or ''. It also returns '' when\n",
"    ``ausdatsf`` is None, '' or the literal \"NULL\"; without that guard the\n",
"    entry would read e.g. ``5/NULL`` or the slice would fail on None.\n",
"    \"\"\"\n",
"    law_number = row['gesnr']\n",
"    if law_number in (None, ''):\n",
"        return ''\n",
"    issue_date = row['ausdatsf']\n",
"    if issue_date in (None, '', 'NULL'):\n",
"        return ''\n",
"    return '\\tnumber = {' + law_number + '/' + issue_date[0:4] + '},\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def volume(row):\n",
"    \"\"\"Return the BibTeX ``volume`` line from ``stuecknr``, or '' if it is None or ''.\"\"\"\n",
"    issue = row['stuecknr']\n",
"    if issue in (None, ''):\n",
"        return ''\n",
"    return '\\tvolume = {' + issue + '},\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def url(row):\n",
"    \"\"\"Return the BibTeX ``url`` line with an alex.onb.ac.at link for ``row``.\n",
"\n",
"    Returns '' when ``bool_exists`` is None, '' or \"0\". The link is built from\n",
"    ``aid``, the part of ``ordner`` from index 9 onwards, and ``seite``\n",
"    left-padded with zeros to eight characters.\n",
"    \"\"\"\n",
"    if row['bool_exists'] in (None, '', '0'):\n",
"        return ''\n",
"    link = ('https://alex.onb.ac.at/cgi-content/alex?aid=' + row['aid']\n",
"            + '&datum=' + row['ordner'][9:]\n",
"            + '&seite=' + row['seite'].zfill(8))\n",
"    return '\\turl = {' + link + '},\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def abstract(row):\n",
"    \"\"\"Return the BibTeX ``abstract`` line listing the three raw date fields.\n",
"\n",
"    ``ausdatsf``, ``gesdatsf`` and ``wirdatsf`` are inserted unchanged, each on\n",
"    its own line with a German label.\n",
"    \"\"\"\n",
"    pieces = [\n",
"        '\\tabstract = {Ausgabedatum: ', row['ausdatsf'],\n",
"        '\\nGesetzesdatum: ', row['gesdatsf'],\n",
"        '\\nWirkungsdatum: ', row['wirdatsf'],\n",
"        '},\\n',\n",
"    ]\n",
"    return ''.join(pieces)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def shorttitle(row):\n",
"    \"\"\"Return the BibTeX ``shorttitle`` line as ``kurzform Nr. gesnr/year``.\n",
"\n",
"    The year is the first four characters of ``ausdatsf``. Returns '' when\n",
"    ``kurzform`` is None or ''.\n",
"    \"\"\"\n",
"    short = row['kurzform']\n",
"    if short in (None, ''):\n",
"        return ''\n",
"    citation = short + ' Nr. ' + row['gesnr'] + '/' + row['ausdatsf'][0:4]\n",
"    return '\\tshorttitle = {' + citation + '},\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def keywords(row):\n",
"    \"\"\"Return the fixed BibTeX ``keywords`` line; ``row`` is not consulted.\"\"\"\n",
"    return '\\tkeywords = {LGBl., ALEX},\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def language(row):\n",
"    \"\"\"Return the BibTeX ``language`` line from ``sprache``, or '' if it is None or ''.\"\"\"\n",
"    code = row['sprache']\n",
"    if code in (None, ''):\n",
"        return ''\n",
"    return '\\tlanguage = {' + code + '},\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def bibkey(row):\n",
"    \"\"\"Return the opening line of a ``@legislation`` entry keyed ``alexlgbl<id>``.\"\"\"\n",
"    entry_id = row['id']\n",
"    return '@legislation{alexlgbl' + entry_id + ',\\n'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## And now for some Zotero RDF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def title(row):\n",
"    \"\"\"Return ``inhalt`` followed by a newline, or '' when it is None or ''.\"\"\"\n",
"    content = row['inhalt']\n",
"    if content in (None, ''):\n",
"        return ''\n",
"    return content + '\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def year(row):\n",
"    \"\"\"Return the first usable date for ``row`` followed by a newline.\n",
"\n",
"    ``ausdatsf``, ``gesdatsf`` and ``wirdatsf`` are tried in that order; the\n",
"    first one that is not None, '' or \"NULL\" is formatted as YYYY-MM-DD. If\n",
"    none qualifies, ``ordner[9:13]`` is returned instead, and '' when\n",
"    ``ordner`` is None, '' or \"NULL\" as well.\n",
"    \"\"\"\n",
"    unusable = (None, '', 'NULL')\n",
"    for field in ('ausdatsf', 'gesdatsf', 'wirdatsf'):\n",
"        stamp = row[field]\n",
"        if stamp not in unusable:\n",
"            return stamp[0:4] + '-' + stamp[4:6] + '-' + stamp[6:8] + '\\n'\n",
"    folder = row['ordner']\n",
"    if folder not in unusable:\n",
"        return folder[9:13] + '\\n'\n",
"    return ''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def pages(row):\n",
"    \"\"\"Return ``seite`` followed by a newline, or '' when it is None or ''.\"\"\"\n",
"    page = row['seite']\n",
"    return '' if page in (None, '') else page + '\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def number(row):\n",
"    \"\"\"Return ``gesnr/year`` followed by a newline, or '' if either part is missing.\n",
"\n",
"    The year is ``ordner[4:8]`` when ``ordner`` is not None, \"NULL\" or '';\n",
"    otherwise it is the first four characters of ``ausdatsf`` (yyyymmdd).\n",
"    The fallback previously sliced ``[4:6]``, which is the month of a\n",
"    yyyymmdd value rather than the year.\n",
"    \"\"\"\n",
"    law_number = row['gesnr']\n",
"    if law_number in (None, ''):\n",
"        return ''\n",
"    missing = (None, 'NULL', '')\n",
"    if row['ordner'] not in missing:\n",
"        issue_year = row['ordner'][4:8]\n",
"    elif row['ausdatsf'] not in missing:\n",
"        issue_year = row['ausdatsf'][0:4]\n",
"    else:\n",
"        return ''\n",
"    return law_number + '/' + issue_year + '\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def volume(row):\n",
"    \"\"\"Return an empty string; ``row`` is not consulted.\"\"\"\n",
"    return ''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def url(row):\n",
"    \"\"\"Return an alex.onb.ac.at link for ``row`` followed by a newline.\n",
"\n",
"    Returns '' when ``bool_exists`` is None, '' or \"0\". The link is built from\n",
"    ``aid``, the part of ``ordner`` from index 9 onwards, and ``seite``\n",
"    left-padded with zeros to eight characters.\n",
"    \"\"\"\n",
"    if row['bool_exists'] in (None, '', '0'):\n",
"        return ''\n",
"    link = ('https://alex.onb.ac.at/cgi-content/alex?aid=' + row['aid']\n",
"            + '&datum=' + row['ordner'][9:]\n",
"            + '&seite=' + row['seite'].zfill(8))\n",
"    return link + '\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def abstract(row):\n",
"    \"\"\"Return the issue number and the three raw date fields, one per line.\n",
"\n",
"    Every value goes through ``str()``, so a None field shows up as \"None\".\n",
"    \"\"\"\n",
"    lines = [\n",
"        str(row['stuecknr']) + '. Stück',\n",
"        'Ausgabedatum: ' + str(row['ausdatsf']),\n",
"        'Gesetzesdatum: ' + str(row['gesdatsf']),\n",
"        'Wirkungsdatum: ' + str(row['wirdatsf']),\n",
"    ]\n",
"    return '\\n'.join(lines) + '\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def shorttitle(row):\n",
"    \"\"\"Return ``kurzform`` on one line and ``kurzform Nr. gesnr/year`` on the next.\n",
"\n",
"    The year is ``ordner[4:8]``. The second line carries no trailing newline.\n",
"    Returns '' when ``kurzform`` is None or ''.\n",
"    \"\"\"\n",
"    short = row['kurzform']\n",
"    if short in (None, ''):\n",
"        return ''\n",
"    citation = short + ' Nr. ' + row['gesnr'] + '/' + row['ordner'][4:8]\n",
"    return short + '\\n' + citation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def keywords(row):\n",
"    \"\"\"Return the fixed keyword text followed by a newline; ``row`` is not consulted.\"\"\"\n",
"    return 'Gesetze LGBl., ALEX\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def language(row):\n",
"    \"\"\"Return ``sprache`` followed by a newline, or '' when it is None or ''.\"\"\"\n",
"    code = row['sprache']\n",
"    if code in (None, ''):\n",
"        return ''\n",
"    return code + '\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def bibkey(row):\n",
"    # Returns the text that attributes() places first for each record.\n",
"    # As written, both branches return the same single newline, so the\n",
"    # `bool_exists` check does not change the result.\n",
"    # NOTE(review): the two branches presumably were meant to differ -- confirm.\n",
"    if row['bool_exists'] not in (None, \"\", \"0\"):\n",
"        return '\\n'\n",
"    else:\n",
"        return \"\\n\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A function that calls all attributes. (Just for a better overview.) \n",
"For author and editor, semicolons are exchanged with an \"and\"."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def attributes(row):\n",
"    \"\"\"Concatenate every field of one record into a single string.\n",
"\n",
"    The field builders are called in output order and their results joined,\n",
"    followed by a closing newline. Each builder runs exactly once; earlier\n",
"    this function called every builder a first time and threw the result\n",
"    away before calling it again for the return value.\n",
"\n",
"    The builder names are looked up when this function is called, so the\n",
"    most recently executed definition of each one is used. This notebook\n",
"    defines each of these names in two different cells.\n",
"    \"\"\"\n",
"    builders = (\n",
"        bibkey, title, year, number, pages,\n",
"        volume,  # Stücknummer\n",
"        keywords, url, abstract, shorttitle, language,\n",
"    )\n",
"    return ''.join(build(row) for build in builders) + '\\n'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A function that writes individual bibliography entries including all attributes. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def convert_row(row):\n",
"    \"\"\"Return the attributes of ``row`` wrapped in ``rdfstarttag`` / ``rdfendtag``.\n",
"\n",
"    Both tags are module-level names that are looked up at call time.\n",
"    \"\"\"\n",
"    return ''.join((rdfstarttag, attributes(row), rdfendtag))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Strings that convert_row() places before and after each record.\n",
"# Both are empty here; a later cell assigns these two names again.\n",
"rdfstarttag = ''''''\n",
"rdfendtag = \"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# csv.DictReader has already consumed the header line, so data[0] is the first\n",
"# real record; slicing with data[1:] would leave that record out.\n",
"bib = '\\n\\n'.join(convert_row(record) for record in data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Preview characters 120000-139999 of the generated text.\n",
"print(bib[120000:140000])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Escape LaTeX special characters for the BibTeX output and clean up the text for the RDF output."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Put a backslash in front of every LaTeX special character in one pass.\n",
"# The table is built from \"\\\\\" + char because literals such as \"\\&\" are\n",
"# invalid escape sequences that newer Python versions warn about.\n",
"latex_escapes = str.maketrans({char: \"\\\\\" + char for char in \"&#$%_~^\"})\n",
"bib_rep_char = bib.translate(latex_escapes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clean-up pass over the generated text for the RDF output.\n",
"# First line: rewrites 'LGBl.' to 'Lgbl.' directly after 'z:shortTitle>' / 'z:shorttitle>'.\n",
"# NOTE(review): as written, replacing \"&\" with \"&\" and \" \" with \" \" changes\n",
"# nothing -- confirm the intended replacement text.\n",
"rdf_escape = bib.replace(\"&\", \"&\").replace(\"z:shortTitle>LGBl.\",\"z:shortTitle>Lgbl.\").replace(\"z:shorttitle>LGBl.\",\"z:shorttitle>Lgbl.\").replace(\" \",\" \")\n",
"# Second line: deletes the listed 'YYYY - YYYY' year ranges wherever they occur.\n",
"rdf_escape = rdf_escape.replace(\"1863 - 1922\",\"\").replace(\"1849 - 1935\",\"\").replace(\"1850 - 1995\",\"\").replace(\"1850 - 1929\",\"\").replace(\"1850 - 1960\",\"\").replace(\"1848 - 1918\",\"\").replace(\"1918 - 1930\",\"\").replace(\"1922 - 1980\",\"\").replace(\"1881 - 1918\",\"\").replace(\"1850 - 1918\",\"\").replace(\"1849 - 1915\",\"\").replace(\"1860 - 1918\",\"\").replace(\"1850 - 1858\",\"\").replace(\"1849 - 1917\",\"\").replace(\"1849 - 1920\",\"\").replace(\"1851 - 1918\",\"\").replace(\"1850 - 1859\",\"\")\n",
"# Third line: drops a space that directly precedes '<' or directly follows '>'.\n",
"rdf_escape = rdf_escape.replace(\" <\",\"<\").replace(\"> \",\">\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Credits and licence note, written as a BibTeX @COMMENT entry; bib_final\n",
"# places it right after the encoding header.\n",
"bib_comment = '''@COMMENT{Data Source: Austrian National Library/Digitale Services/ALEX\n",
"Database Extraction: Andreas Wieser/Iris Klawatsch\n",
"Conversion: Stephan Kurz, Austrian Academy of Sciences\n",
"Licence: CC-0 (This is metadata only, freely available -- and it's describing non-copyrightable legal material)\n",
"Thanks for letting us reuse this data! \n",
"}\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assemble the .bib file text: encoding header, credits comment, the escaped\n",
"# entries, and the jabref-meta footer.\n",
"bib_final = ''.join([\n",
"    '% Encoding: UTF-8\\n\\n',\n",
"    bib_comment,\n",
"    bib_rep_char,\n",
"    '\\n\\n@Comment{jabref-meta: databaseType:bibtex;}',\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Text placed at the very start of rdf_final; as written it is a single newline.\n",
"rdf_comment = '''\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reassigns the two names that convert_row() also reads; from here on they\n",
"# hold the strings that rdf_final puts around the whole cleaned-up text.\n",
"# Any convert_row() call made after this cell runs will use these values.\n",
"rdfstarttag = '\\n'\n",
"rdfendtag = '\\n'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assemble the RDF file text: leading comment, start tag, cleaned-up entries, end tag.\n",
"rdf_final = ''.join([rdf_comment, rdfstarttag, rdf_escape, rdfendtag])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Preview the first 10000 characters of the final RDF text.\n",
"print(rdf_final[:10000])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Write a bibtex bibliography file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The with-block flushes and closes the file even if the write fails.\n",
"with open(\"lgbl_all.bib\", \"w\", encoding=\"utf8\") as f:\n",
"    f.write(bib_final)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Write a Zotero RDF bibliography file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The with-block flushes and closes the file even if the write fails.\n",
"with open(\"lgbl_all_extra.rdf\", \"w\", encoding=\"utf8\") as f:\n",
"    f.write(rdf_final)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}