There is a newer version of this record available.

Software Open Access

The TSL machine: parser, lemma analysis, sentiment analysis and autocoding for Telegram chats

Giovanni Spitale; Federico Germani; Nikola Biller - Andorno


DCAT Export

<?xml version='1.0' encoding='utf-8'?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:adms="http://www.w3.org/ns/adms#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:dctype="http://purl.org/dc/dcmitype/" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:duv="http://www.w3.org/ns/duv#" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:frapo="http://purl.org/cerif/frapo/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:gsp="http://www.opengis.net/ont/geosparql#" xmlns:locn="http://www.w3.org/ns/locn#" xmlns:org="http://www.w3.org/ns/org#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:wdrs="http://www.w3.org/2007/05/powder-s#">
  <rdf:Description rdf:about="https://doi.org/10.5281/zenodo.5533907">
    <!-- The record is typed as a dcat:Dataset and, via dct:type, as DCMI "Software". -->
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
    <dct:type rdf:resource="http://purl.org/dc/dcmitype/Software"/>
    <!-- The DOI URL is used three ways: as the subject IRI, as an anyURI-typed identifier, and as the foaf:page. -->
    <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://doi.org/10.5281/zenodo.5533907</dct:identifier>
    <foaf:page rdf:resource="https://doi.org/10.5281/zenodo.5533907"/>
    <dct:creator>
      <!-- Creator 1 of 3: a foaf:Agent whose node IRI is the ORCID URL; the bare ORCID iD is repeated as a string-typed dct:identifier. -->
      <rdf:Description rdf:about="http://orcid.org/0000-0002-6812-0979">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-6812-0979</dct:identifier>
        <foaf:name>Giovanni Spitale</foaf:name>
        <!-- Affiliation is a blank-node foaf:Organization carrying only a name (no identifier). -->
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>University of Zurich - Institute of Biomedical Ethics and History of Medicine</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <!-- Creator 2 of 3: a foaf:Agent whose node IRI is the ORCID URL; the bare ORCID iD is repeated as a string-typed dct:identifier. -->
      <rdf:Description rdf:about="http://orcid.org/0000-0002-5604-0437">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0002-5604-0437</dct:identifier>
        <foaf:name>Federico Germani</foaf:name>
        <!-- Affiliation is a blank-node foaf:Organization carrying only a name (no identifier). -->
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>University of Zurich - Institute of Biomedical Ethics and History of Medicine</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:creator>
      <!-- Creator 3 of 3: a foaf:Agent whose node IRI is the ORCID URL; the bare ORCID iD is repeated as a string-typed dct:identifier. -->
      <rdf:Description rdf:about="http://orcid.org/0000-0001-7661-1324">
        <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Agent"/>
        <dct:identifier rdf:datatype="http://www.w3.org/2001/XMLSchema#string">0000-0001-7661-1324</dct:identifier>
        <!-- NOTE(review): the surname is written with spaces around the hyphen; kept exactly as recorded. Confirm against the source record. -->
        <foaf:name>Nikola Biller - Andorno</foaf:name>
        <!-- Affiliation is a blank-node foaf:Organization carrying only a name (no identifier). -->
        <org:memberOf>
          <foaf:Organization>
            <foaf:name>University of Zurich - Institute of Biomedical Ethics and History of Medicine</foaf:name>
          </foaf:Organization>
        </org:memberOf>
      </rdf:Description>
    </dct:creator>
    <dct:title>The TSL machine: parser, lemma analysis, sentiment analysis and autocoding for Telegram chats</dct:title>
    <!-- Publisher is a blank-node foaf:Agent identified by name only. -->
    <dct:publisher>
      <foaf:Agent>
        <foaf:name>Zenodo</foaf:name>
      </foaf:Agent>
    </dct:publisher>
    <!-- dct:issued is emitted twice in this block: here as an xsd:gYear, and after the keywords as a full xsd:date. -->
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2021</dct:issued>
    <!-- One dcat:keyword element per keyword; plain literals with no language tag. -->
    <dcat:keyword>natural language processing</dcat:keyword>
    <dcat:keyword>NLP</dcat:keyword>
    <dcat:keyword>telegram</dcat:keyword>
    <dcat:keyword>covid-19</dcat:keyword>
    <dcat:keyword>social listening</dcat:keyword>
    <dcat:keyword>green pass</dcat:keyword>
    <dcat:keyword>vaccine</dcat:keyword>
    <dcat:keyword>freedom</dcat:keyword>
    <dcat:keyword>ethics</dcat:keyword>
    <dct:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2021-09-28</dct:issued>
    <!-- Language is given as a publications.europa.eu language-authority IRI (ENG). -->
    <dct:language rdf:resource="http://publications.europa.eu/resource/authority/language/ENG"/>
    <!-- The zenodo.org record URL is declared owl:sameAs the DOI-identified resource. -->
    <owl:sameAs rdf:resource="https://zenodo.org/record/5533907"/>
    <!-- Alternate identifier: the zenodo.org record URL as an anyURI-typed skos:notation, with scheme agency "url". -->
    <adms:identifier>
      <adms:Identifier>
        <skos:notation rdf:datatype="http://www.w3.org/2001/XMLSchema#anyURI">https://zenodo.org/record/5533907</skos:notation>
        <adms:schemeAgency>url</adms:schemeAgency>
      </adms:Identifier>
    </adms:identifier>
    <!-- Versioning: this record (…5533907) is a version of DOI …5533906. -->
    <!-- NOTE(review): …5533906 is presumably the version-independent DOI covering all releases; confirm. -->
    <dct:isVersionOf rdf:resource="https://doi.org/10.5281/zenodo.5533906"/>
    <owl:versionInfo>1.0.0</owl:versionInfo>
    <dct:description>&lt;p&gt;The purpose of this tool is performing NLP analysis on Telegram chats. Telegram chats can be exported as .json files from the official client, Telegram Desktop (v. 2.9.2.0).&amp;nbsp;&lt;/p&gt; &lt;p&gt;The files are parsed, the content is used to populate a message dataframe, which is then anonymized.&amp;nbsp;&lt;/p&gt; &lt;p&gt;&lt;strong&gt;The software calculates and displays the following information:&lt;/strong&gt;&lt;/p&gt; &lt;ul&gt; &lt;li&gt;user count (n of users, new users per day, removed users per day);&lt;/li&gt; &lt;li&gt;message count (n and relative frequency of messages, messages per day);&lt;/li&gt; &lt;li&gt;autocoded messages (anonymized message dataframe with code weights assigned to each message based on a customizable set of regex rules);&lt;/li&gt; &lt;li&gt;prevalence of codes (n and relative frequency);&lt;/li&gt; &lt;li&gt;prevalence of lemmas&amp;nbsp;(n and relative frequency);&lt;/li&gt; &lt;li&gt;prevalence of lemmas segmented by autocode (n and relative frequency);&lt;/li&gt; &lt;li&gt;mean sentiment per day;&lt;/li&gt; &lt;li&gt;mean sentiment&amp;nbsp;segmented by autocode.&lt;/li&gt; &lt;/ul&gt; &lt;p&gt;&lt;strong&gt;The software outputs:&lt;/strong&gt;&lt;/p&gt; &lt;ul&gt; &lt;li&gt;messages_df_anon.csv - an anonymized file containing the progressive id of the message, the date, the univocal pseudonym of the sender, and the text;&lt;/li&gt; &lt;li&gt;usercount_df.csv - user count dataframe;&lt;/li&gt; &lt;li&gt;user_activity_df.csv - user activity dataframe;&lt;/li&gt; &lt;li&gt;messagecount_df.csv - message count dataframe;&lt;/li&gt; &lt;li&gt;messages_df_anon_coded.csv -&amp;nbsp;an anonymized file containing the progressive id of the message, the date, the univocal pseudonym of the sender,&amp;nbsp;the text, the codes, and the sentiment;&lt;/li&gt; &lt;li&gt;autocode_freq_df.csv - general prevalence of codes;&lt;/li&gt; &lt;li&gt;lemma_df.csv - lemma frequency;&lt;/li&gt; 
&lt;li&gt;autocode_freq_df_[rule_name].csv - lemma frequency in coded messages, one file per rule;&lt;/li&gt; &lt;li&gt;daily_sentiment_df.csv - daily sentiment;&lt;/li&gt; &lt;li&gt;sentiment_by_code_df.csv - sentiment segmented by code;&lt;/li&gt; &lt;li&gt;messages_anon.txt - anonymized text file generated from the message data frame, for easy import in other software for text mining or qualitative analysis;&lt;/li&gt; &lt;li&gt;messages_anon_MaxQDA.txt - anonymized text file generated from the message data frame, formatted specifically for MaxQDA (to track speakers and codes).&lt;/li&gt; &lt;/ul&gt; &lt;p&gt;Dependencies:&lt;/p&gt; &lt;ul&gt; &lt;li&gt;pandas (1.2.1)&lt;/li&gt; &lt;li&gt;json&lt;/li&gt; &lt;li&gt;random&lt;/li&gt; &lt;li&gt;os&lt;/li&gt; &lt;li&gt;re&lt;/li&gt; &lt;li&gt;tqdm (4.62.2)&lt;/li&gt; &lt;li&gt;datetime (4.3)&lt;/li&gt; &lt;li&gt;matplotlib (3.4.3)&lt;/li&gt; &lt;li&gt;Spacy (3.1.2) + it_core_news_md&lt;/li&gt; &lt;li&gt;wordcloud (1.8.1)&lt;/li&gt; &lt;li&gt;Counter&lt;/li&gt; &lt;li&gt;feel_it (1.0.3)&lt;/li&gt; &lt;li&gt;torch (1.9.0)&lt;/li&gt; &lt;li&gt;numpy (1.21.1)&lt;/li&gt; &lt;li&gt;transformers (4.3.3)&lt;/li&gt; &lt;/ul&gt; &lt;p&gt;This code is optimized for Italian.&amp;nbsp;&lt;/p&gt; &lt;p&gt;Lemma analysis is based on spaCy, which provides several other models for other languages (&amp;nbsp;&lt;a href="https://spacy.io/models"&gt;https://spacy.io/models&lt;/a&gt;&amp;nbsp;) so it can easily be adapted.&lt;/p&gt; &lt;p&gt;Sentiment analysis is performed using &lt;a href="https://github.com/MilaNLProc/feel-it"&gt;FEEL-IT: Emotion and Sentiment Classification for the Italian Language&lt;/a&gt;&amp;nbsp;(Kudos to Federico Bianchi &amp;lt;f.bianchi@unibocconi.it&amp;gt;; Debora Nozza &amp;lt;debora.nozza@unibocconi.it&amp;gt;; and Dirk Hovy &amp;lt;dirk.hovy@unibocconi.it&amp;gt;). Their work is specific for Italian. 
To perform sentiment analysis in other languages one could consider nltk.sentiment&lt;/p&gt; &lt;p&gt;The code is structured in a Jupyter-lab notebook, heavily commented for future reference.&lt;/p&gt;</dct:description>
    <dct:description>{"references": ["Bianchi F, Nozza D, Hovy D. FEEL-IT: Emotion and Sentiment Classification for the Italian Language. In: Proceedings of the 11th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis. Association for Computational Linguistics; 2021. https://github.com/MilaNLProc/feel-it"]}</dct:description>
    <!-- Access rights are stated twice: as a publications.europa.eu access-right authority IRI (PUBLIC), -->
    <!-- and as an info:eu-repo rights statement labelled "Open Access". -->
    <dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
    <dct:accessRights>
      <dct:RightsStatement rdf:about="info:eu-repo/semantics/openAccess">
        <rdfs:label>Open Access</rdfs:label>
      </dct:RightsStatement>
    </dct:accessRights>
    <!-- Distribution 1 of 2: carries the license (CC BY 4.0 legal code) and the DOI as access URL; no file details. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dct:license rdf:resource="https://creativecommons.org/licenses/by/4.0/legalcode"/>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.5533907"/>
      </dcat:Distribution>
    </dcat:distribution>
    <!-- Distribution 2 of 2: the downloadable ZIP archive, with its size in bytes and media type. -->
    <dcat:distribution>
      <dcat:Distribution>
        <dcat:accessURL rdf:resource="https://doi.org/10.5281/zenodo.5533907"/>
        <dcat:byteSize>5903284</dcat:byteSize>
        <!-- Spaces in the file name are percent-encoded (%20): a raw space is not a legal character in an IRI, -->
        <!-- so strict RDF parsers may reject or mangle the unencoded form. -->
        <dcat:downloadURL rdf:resource="https://zenodo.org/record/5533907/files/telegram%20social%20listening%20v1.0.0.zip"/>
        <dcat:mediaType>application/zip</dcat:mediaType>
      </dcat:Distribution>
    </dcat:distribution>
  </rdf:Description>
</rdf:RDF>
290 views
17 downloads
Statistic | All versions | This version
Views | 290 | 15
Downloads | 17 | 2
Data volume | 63.4 MB | 11.8 MB
Unique views | 265 | 15
Unique downloads | 15 | 2

Share

Cite as