This tool parses .json files containing Telegram chats exported with Telegram Desktop (v. 2.9.2.0). It takes a .json chat file as input and outputs tabular data as .csv files, structured data as text files (including a pre-structured file for MaxQDA), and .html reports of the notebook (see the export section at the end).
Dependencies: pandas, tqdm, matplotlib, spacy (with the it_core_news_md model), wordcloud, torch, numpy, transformers, and IPython/Jupyter (nbconvert is used for the .html exports).
### IMPORTS
import json
import pandas as pd
import random
import os
import re
from pandas.tseries.offsets import DateOffset
from tqdm import tqdm
from datetime import datetime
import subprocess
import matplotlib.pyplot as plt
import spacy
import it_core_news_md
from wordcloud import WordCloud
from collections import Counter
from IPython.display import clear_output, Markdown
# Spacy additional configuration - custom stopwords can be added here
stoplist = ["text", "type", "e", "essere", "il", "", " ", "place", "name", "\\n", "\\n\\n", "o"]
nlp = it_core_news_md.load()
spacy_stopwords = spacy.lang.it.stop_words.STOP_WORDS
nlp.max_length = 100000000
# print in markdown (just to make things prettier)
def printmd(string, color=None):
colorstr = "<span style='color:{}'>{}</span>".format(color, string)
display(Markdown(colorstr))
### SENTIMENT ANALYSIS
# Optimized for Italian (kudos to Federico Bianchi, Debora Nozza and Dirk Hovy - https://github.com/MilaNLProc/feel-it )
# If sentiment analysis on EN text is needed, consider using TextBlob instead (see the sketch below).
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("MilaNLProc/feel-it-italian-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("MilaNLProc/feel-it-italian-sentiment")
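# A minimal sketch for English text (not used in this notebook; assumes TextBlob is installed):
# TextBlob exposes a simple polarity score in [-1, 1] that could replace FEEL-IT for EN chats, e.g.:
#   from textblob import TextBlob
#   polarity = TextBlob("I really like this group").sentiment.polarity  # > 0 means positive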
files = [f for f in os.listdir('jsons/') if re.match(r'.*\.json', f)]
printmd('\n\n### available files: \n\n', color = "black")
for x in files:
print (x)
filename = input('\nSelect the file to work with:')
### MESSAGE RAW DATAFRAME
# Opening and loading the JSON exported from Telegram Desktop
f = open('jsons/'+filename, encoding="utf8")
x = f.read()
y = json.loads(x)
# Creating the message dataframe from the JSON
df1 = pd.DataFrame.from_dict(y)
messages_df = pd.DataFrame(list(df1['messages']))
messages_df['date'] = pd.to_datetime(messages_df['date'], format='%Y-%m-%dT%H:%M:%S')
# Creating df for analysis of user flow
messages_df_users = messages_df.loc[messages_df['type'] == "service"]
printmd('\n\n### File \"' + filename + '\" successfully loaded\n\n', color = "green")
Here we use the service messages of the chat to analyse the growth or decline of the total number of users.
Important: since the export does not contain the user count at the beginning of the chat, the initial count is estimated from the current number of users and the users' variation over time. This proved correct in test chats, but can yield negative results in larger ones; a definitive explanation is still being investigated.
In short, the estimate works as sketched below:
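# A minimal sketch of the estimate computed in the cells below (not executed here; the variable
# names are the ones used in the code that follows):
# if the chat was created directly, a 'create_group' service message lists the initial members:
#     initial_users = number of members listed in the 'create_group' message
# otherwise (e.g. the group was migrated and its creation is not part of the export):
#     initial_users = current_users - users_joined - users_invited
# the estimated total is then accumulated day by day:
#     total(first day) = initial_users + joined + invited
#     total(day n)     = total(day n - 1) + joined(day n) + invited(day n)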
### COUNT USERS
current_users = input('\nPlease specify the amount of users in the chat at the moment of the data collection:')
current_users = int(current_users)
## Function to count joined, invited and removed users in a given dataframe
def count_users(dataframe_slice):
joined = 0
invited = 0
removed = 0
for index, row in dataframe_slice.iterrows():
if row['action'] == 'join_group_by_link':
joined = joined + 1
if row['action'] == 'invite_members':
invited = invited + len(row['members'])
if row['action'] == 'remove_members':
removed = removed + len(row['members'])
return(joined, invited, removed)
# get first and last day
datelist = messages_df_users['date'].dt.date.tolist()
startday = datelist[0]
lastday = datelist[-1]
# make list of days
dayslist = pd.date_range(start=startday,end=lastday).tolist()
#slice dataframe by time
timeframes = {}
for day in tqdm(dayslist):
    nextday = day + DateOffset(days=1)
    mask = (messages_df_users['date'] >= day) & (messages_df_users['date'] < nextday)
    df = messages_df_users.loc[mask]
    timeframes[str(day)[:10]] = df
# count users in each df in timeframes
usercount = {}
for key, item in tqdm(timeframes.items()):
count = count_users(item)
joined = count[0]
invited = count[1]
removed = count[2]
usercount[key] = [joined, invited, removed]
usercount_df = pd.DataFrame.from_dict(usercount).T
usercount_df.columns = ['joined', 'invited', 'removed']
users_joined = sum(usercount_df['joined'])
users_invited = sum(usercount_df['invited'])
users_removed = sum(usercount_df['removed'])
# initial users: read from the JSON ('create_group' service message) or computed in case of group migration
if 'create_group' in messages_df['action'].tolist():
    initial_users = len((messages_df.loc[messages_df['action'] == 'create_group'])['members'].tolist()[0])
else:
    initial_users = current_users - users_joined - users_invited
x = 0
total = []
for index, row in tqdm(usercount_df.iterrows()):
if x == 0:
x = row['joined'] + row['invited'] + initial_users
else:
x = x + row['joined'] + row['invited']
total.append(x)
usercount_df['total (estimated)'] = total
print("current users: ", current_users)
print("initial users: ", initial_users)
print("users who joined: ", users_joined)
print("invited users: ", users_invited)
print("removed users: ", users_removed)
display(usercount_df)
current users: 2
initial users: 2
users who joined: 0
invited users: 0
removed users: 0
day | joined | invited | removed | total (estimated) |
---|---|---|---|---|
2021-09-21 | 0 | 0 | 0 | 2 |
# Plotting users per day
t = usercount_df.index.tolist()
s = usercount_df['total (estimated)']
fig, ax = plt.subplots()
ax.plot(t, s)
ax.xaxis.set_major_locator(plt.MaxNLocator(10))
ax.set(xlabel='Day', ylabel='Users', title='Users per day (estimated)')
ax.grid()
plt.xticks(rotation=90)
plt.show()
The files provided with the notebook contain Italian names and toponyms (kudos to Phelipe de Sterlich and ISTAT). These files can easily be replaced if needed.
To achieve a higher degree of precision, this is done with regex. It takes time; be patient.
IMPORTANT: surnames are not removed from the messages. The reason is that people very seldom refer to other members of the chat, or to themselves, by surname; surnames are far more often used to refer to public figures or sources of information, and are thus valuable for the analysis. If needed, surname removal can easily be added by copy-pasting a few lines of code and using a suitable list of surnames (a sketch is given below). Keeping this in mind, even if rather comprehensive and accurate, the anonymization process does not guarantee the absence of other identifiers in the text. It is therefore suggested to release datasets generated with this software as "available upon request".
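A minimal sketch of how this could be done, assuming a plain-text list of surnames (one per line) in a hypothetical file resources/surnames.txt, which is not shipped with the notebook; the lines mirror the handling of names and places below:
# surnamesfile = open('resources/surnames.txt', encoding="utf8")
# surnameslist = [line.rstrip().capitalize() for line in surnamesfile.readlines()]
# surnamesfile.close()
# long_regex_surnames = '\\b(' + '|'.join(surnameslist) + ')\\b'
# regex_surnames = re.compile(long_regex_surnames, re.IGNORECASE)
# ...and, inside the anonymization loop below:
# message = re.sub(regex_surnames, '[surname]', message)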
### ANONYMIZATION + PSEUDONYMIZATION
# Removing lines that are not messages (e.g: user joined the chat) - the original ID of the entry is preserved.
messages_df_anon = messages_df.loc[messages_df['action'].isna()]
# Anonymization, 1st step: removing all the metadata
messages_df_anon = messages_df_anon[['id', 'date', 'text', 'from_id']]
# Anonymization, 2nd step: replacing user IDs with random names
# get list of users
userlist = messages_df_anon['from_id'].tolist()
# Pseudonymization: open list of common names in Italy
namesfile = open('resources/names.txt', encoding="utf8")
nameslist = namesfile.readlines()
namesfile.close()
nameslist_ = []
for x in nameslist:
y = x.rstrip()
nameslist_.append(y)
nameslist = nameslist_
# Pseudonymization: replace each user id with a unique pseudonym
secret = {}
pseudonyms = []
printmd('\n\n### Pseudonymization: replace each user id with a unique pseudonym\n\n', color = "black")
for x in tqdm(userlist):
if x in secret.keys():
newname = secret[x]
pseudonyms.append(newname)
else:
newname = random.choice(nameslist)
secret[x] = newname
pseudonyms.append(newname)
# appending list of pseudonyms to df and dropping user id
messages_df_anon.insert(4, 'pseudonym', pseudonyms)
messages_df_anon = messages_df_anon.drop(columns=['from_id'])
# Open list of italian municipalities (source: ISTAT)
placesfile = open('resources/places.txt', encoding="utf8")
placeslist = placesfile.readlines()
placesfile.close()
placeslist_ = []
for x in placeslist:
y = x.rstrip()
z = y.capitalize()
placeslist_.append(z)
placeslist = placeslist_
# df messages to list
non_anonymized_messages = messages_df_anon['text'].tolist()
printmd('\n\n### Anonymization: remove names, places and user mentions (e.g. @thisuser) from the messages\n\n', color = "black")
# Create long regex for names
long_regex_names = []
for x in tqdm(nameslist):
long_regex_names.append(x)
long_regex_names = '|'.join(long_regex_names)
long_regex_names = '\\b(' + long_regex_names + ')\\b'
regex_names = re.compile(long_regex_names, re.IGNORECASE)
# Create long regex for places
long_regex_places = []
for x in tqdm(placeslist):
long_regex_places.append(x)
long_regex_places = '|'.join(long_regex_places)
long_regex_places = '\\b(' + long_regex_places + ')\\b'
regex_places = re.compile(long_regex_places, re.IGNORECASE)
# Anonymize messages using regex
anonymized_messages = []
for message in tqdm(non_anonymized_messages):
message = str(message)
    # regex for user mentions (e.g. @thisuser)
    regex_username = re.compile(r'@\w+', re.IGNORECASE)
message = re.sub(regex_username, '[username]', message)
message = re.sub(regex_names, '[name]', message)
message = re.sub(regex_places, '[place]', message)
anonymized_messages.append(message)
# appending list of messages to df and dropping text column
messages_df_anon.insert(4, 'message', anonymized_messages)
messages_df_anon = messages_df_anon.drop(columns=['text'])
printmd('\n\n### Total messages: ' + str(len(messages_df_anon)) + '\n\n', color = "green")
display(messages_df_anon)
 | id | date | pseudonym | message |
---|---|---|---|---|
1 | 63213 | 2021-09-21 16:21:41 | dominica | All'inizio siamo partiti che dire che il proge... |
2 | 63214 | 2021-09-21 16:21:52 | dominica | [name] di [name] Torre, [name] Ciarrapico e [n... |
3 | 63215 | 2021-09-21 16:22:02 | dominica | [name] fa ridere e riflettere [name] stesso te... |
4 | 63216 | 2021-09-21 16:22:13 | dominica | Ciò che è successo il giorno della conferenza ... |
5 | 63217 | 2021-09-21 16:22:25 | dominica | Crediamo che sia meravigliosa la longevità di ... |
... | ... | ... | ... | ... |
167 | 63445 | 2021-09-24 10:13:22 | dominica | A causa delle loro dimensioni e della loro nat... |
168 | 63446 | 2021-09-24 10:13:29 | dominica | Se sei il [name] proprietario di un uccello o ... |
169 | 63447 | 2021-09-24 10:13:57 | dominica | Questa è la prima cosa. [name] il green pass p... |
170 | 63448 | 2021-09-24 10:14:17 | dominica | La molla del linciaggio è la ricerca del creti... |
171 | 63449 | 2021-09-24 10:14:46 | dominica | Due gioielli politici si sono opposti con tutt... |
171 rows × 4 columns
Here we calculate how many users are active (at least 2 messages sent) and how many are "very active" (arbitrarily defined as users whose message count falls in the 75% quantile).
Important: every "very active user" is by definition also an "active user". Hence, to plot them in a meaningful way, we calculate and plot the number of users who are "active" but not "very active".
Important: a former member may have been a very active user; hence the percentages are calculated on "total users", not on "total current users".
### USER ACTIVITY
# Count frequencies and normalized frequencies of messages per user
s1 = messages_df_anon['pseudonym'].value_counts()
s2 = messages_df_anon['pseudonym'].value_counts(normalize = True)
user_activity_df = pd.concat([s1, s2], axis=1)
user_activity_df.columns = ['count', 'frequency']
# total active users (current + removed)
total_users = len(user_activity_df)
# interacting users (at least 2 messages)
active_v = len(user_activity_df[user_activity_df['count'] >= 2])
active_percent = round(active_v / total_users * 100, 2)
# very active users (75% quantile, the definition can be adjusted here)
v_active_value = user_activity_df['count'].quantile(.75)
mask = user_activity_df['count'] >= v_active_value
v_active = len(user_activity_df.loc[mask])
v_active_percent = round(v_active / total_users * 100, 2)
print('\nTotal active users (at least 1 message, including former active users): ' + str(total_users))
print('Interacting users (at least two messages): ' + str(active_v) + ' (' + str(active_percent) + '%)')
print('Very active users (message count in 75% quantile): ' + str(v_active) + ' (' + str(v_active_percent) + '%)')
printmd('\n\n### Frequencies of messages of active users:\n')
display(user_activity_df)
display(user_activity_df['count'].describe())
Total active users (at least 1 message, including former active users): 2
Interacting users (at least two messages): 2 (100.0%)
Very active users (message count in 75% quantile): 1 (50.0%)
user | count | frequency |
---|---|---|
dominica | 151 | 0.883041 |
rodrigo | 20 | 0.116959 |
count      2.000000
mean      85.500000
std       92.630988
min       20.000000
25%       52.750000
50%       85.500000
75%      118.250000
max      151.000000
Name: count, dtype: float64
# Pie chart with users
only_active_percent = active_percent - v_active_percent
remaining = 100 - only_active_percent - v_active_percent
labels = 'Active users', 'Very active users', 'Passive users'
sizes = [only_active_percent, v_active_percent, remaining]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
ax1.axis('equal')
plt.title('Users per level of activity')
plt.show()
Here we calculate the number of messages written to the chat every day.
### COUNT MESSAGES
# get first and last day
datelist = messages_df_anon['date'].dt.date.tolist()
startday = datelist[0]
lastday = datelist[-1]
# make list of days
dayslist = pd.date_range(start=startday,end=lastday).tolist()
#slice dataframe by time
timeframes = {}
for day in tqdm(dayslist):
nextday = day + DateOffset(days=1)
mask = (messages_df_anon['date'] >= day) & (messages_df_anon['date'] < nextday)
df = messages_df_anon.loc[mask]
timeframes[str(day)[:10]] = df
# count messages in each df in timeframes
messagecount = {}
for key, item in timeframes.items():
messages = len(item)
messagecount[key] = [messages]
messagecount_df = pd.DataFrame.from_dict(messagecount).T
messagecount_df.columns = ['count']
display(messagecount_df)
day | count |
---|---|
2021-09-21 | 56 |
2021-09-22 | 57 |
2021-09-23 | 2 |
2021-09-24 | 56 |
# Plotting messages per day
t = messagecount_df.index.tolist()
s = messagecount_df['count']
fig, ax = plt.subplots()
ax.plot(t, s)
ax.xaxis.set_major_locator(plt.MaxNLocator(10))
ax.set(xlabel='Day', ylabel='messages', title='Messages per day')
ax.grid()
plt.xticks(rotation=90)
plt.show()
Here we use dictionary files for autocoding entire messages. The assumption is that a message in a large group chat can be considered a minimal conceptual unit, i.e. a text in which a user develops one main argument or touches on one main topic. Hence, if one or more rules from a dict fire for a given message, that message is autocoded as belonging to that dict.
The dictionary files are plain text files stored in /dict; the name of the file is used as the name of the code defined by the rules contained in the file.
The rules are written in regex, e.g: 'vaccin.*' will capture 'vaccine', 'vaccines', 'vaccination', and so on.
Regex allows the definition of fairly complex rules. As an example:
(tesser.\sverd.?|pass\sverd.?|certifica\w*\sverd.?)
This rule will fire on "tessera verde" or "tessere verdi" or "pass verde" or "certificato verde", but not for "casa verde" or "verderame" or "tessera del cinema".
For more details on regex and to develop and test new rules, check regex101.
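As a quick sanity check, such a rule can be tested directly in Python before saving it to a dict file (a minimal sketch; re.IGNORECASE is the same flag used by the autocoding function below):
import re
rule = re.compile(r'(tesser.\sverd.?|pass\sverd.?|certifica\w*\sverd.?)', re.IGNORECASE)
print(bool(rule.search('serve il pass verde per entrare')))   # True  -> the rule fires
print(bool(rule.search('certificato verde obbligatorio')))    # True  -> the rule fires
print(bool(rule.search('abito in una casa verde')))           # False -> no match
print(bool(rule.search('la tessera del cinema')))             # False -> no match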
The code has a weight system: if only one rule from the dict fires on a message, the autocode has a weight of 1; if two rules fire, the weight is 2; and so on. MaxQDA does not (yet) support the import of weighted codes, but it might in the future. Moreover, these values can be used for further analyses and are thus exported in the dataframe.
### AUTOCODING
## Function for autocoding
def autocode(dictname, testlist):
dictfilepath = 'dict/' + dictname + '.txt'
# open dict file
dictfile = open(dictfilepath, encoding="utf8")
dictlist = dictfile.readlines()
dictfile.close()
# create regex list from dict file
dictlist_ = []
for x in dictlist:
if not x.startswith("#"):
y = x.strip()
dictlist_.append(y)
dictlist = dictlist_
regexlist = []
for x in dictlist:
regex = re.compile(x, re.IGNORECASE)
regexlist.append(regex)
# Regex search and assign weight
weightlist = []
for message in testlist:
weight = 0
for y in regexlist:
if re.search(y, str(message)):
weight = weight +1
weightlist.append(weight)
# append to anonymized df
messages_df_anon_coded[dictname] = weightlist
messages_df_anon_coded = messages_df_anon
# messages to code
messagelist = messages_df_anon['message'].tolist()
# opening dict files and applying function
dictfiles = os.listdir('dict')
dictnamelist = []
for x in tqdm(dictfiles):
dictname = x.replace('.txt', '')
dictnamelist.append(dictname)
autocode(dictname, messagelist)
messages_df_anon_coded
 | id | date | pseudonym | message | covid-19 | freedom | green pass | links | parrot | university | vaccine |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 63213 | 2021-09-21 16:21:41 | dominica | All'inizio siamo partiti che dire che il proge... | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 63214 | 2021-09-21 16:21:52 | dominica | [name] di [name] Torre, [name] Ciarrapico e [n... | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 63215 | 2021-09-21 16:22:02 | dominica | [name] fa ridere e riflettere [name] stesso te... | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 63216 | 2021-09-21 16:22:13 | dominica | Ciò che è successo il giorno della conferenza ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 63217 | 2021-09-21 16:22:25 | dominica | Crediamo che sia meravigliosa la longevità di ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
167 | 63445 | 2021-09-24 10:13:22 | dominica | A causa delle loro dimensioni e della loro nat... | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
168 | 63446 | 2021-09-24 10:13:29 | dominica | Se sei il [name] proprietario di un uccello o ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
169 | 63447 | 2021-09-24 10:13:57 | dominica | Questa è la prima cosa. [name] il green pass p... | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
170 | 63448 | 2021-09-24 10:14:17 | dominica | La molla del linciaggio è la ricerca del creti... | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
171 | 63449 | 2021-09-24 10:14:46 | dominica | Due gioielli politici si sono opposti con tutt... | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
171 rows × 11 columns
Here we calculate the prevalence of the codes. The value used as 'count' is the summed weight of the code, i.e. the number of times any of its rules fired across all messages. Normalization is performed by dividing the 'count' value by the number of messages and multiplying by 100.
### PREVALENCE OF CODES
# Create dataframe with count of codes
autocode_freq_dict = {}
for x in dictnamelist:
autocode_freq_dict[x] = sum(messages_df_anon_coded[x])
autocode_freq_df = pd.DataFrame.from_dict(autocode_freq_dict, orient='index', columns = ['count'])
# Get total number of messages and normalize codes as %
message_count = len(messages_df_anon_coded)
autocode_freq_df['frequency'] = autocode_freq_df['count'] / message_count * 100
display(autocode_freq_df)
code | count | frequency |
---|---|---|
covid-19 | 38 | 22.222222 |
freedom | 8 | 4.678363 |
green pass | 3 | 1.754386 |
links | 4 | 2.339181 |
parrot | 13 | 7.602339 |
university | 3 | 1.754386 |
vaccine | 32 | 18.713450 |
# bar chart with code relative frequency
labels_plot = autocode_freq_df.index.tolist()
sizes_plot = autocode_freq_df['frequency'].tolist()
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.bar(labels_plot,sizes_plot)
ax.set(xlabel='Code', ylabel='Relative frequency %', title='Relative frequency of autocodes')
ax.grid()
plt.show()
Here we create a bag of words from the anonymized messages, lemmatize it with spaCy, and calculate the frequencies of the lemmas.
Important: remember to specify the correct language model and, if necessary, to add custom stopwords to the stoplist (first code cell of this notebook).
### LEMMA FREQUENCY
# Spacy preprocessing functions
def is_token_allowed(token):
    if (not token or token.is_stop or token.is_punct or token.text.lower() in spacy_stopwords):
        return False
    return True
def preprocess_token(token):
    if is_token_allowed(token):
        token = token.lemma_.strip().lower()
        token = token.replace('\\n', '')
    return token
# Function to generate word cloud. expects: dataframe to use, column with entities, column with frequencies, maximum number of words to display, background color
def generate_wordcloud(df, column_entities, column_numbers, m_words, bg_color):
lemmalist = df[column_entities].tolist()
freqlist = df[column_numbers].tolist()
zip_iterator = zip(lemmalist, freqlist)
freq_dict = dict(zip_iterator)
wordcloud = WordCloud(width=1000, height=500, background_color = bg_color, min_font_size = 8, max_words=m_words, relative_scaling=0.3, normalize_plurals=False).generate_from_frequencies(freq_dict)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# save and show the plot
#plt.savefig(exportdir + "/word clouds/" + "word cloud" + " - " + column_entities + ".svg")
plt.show()
# Building bag of words and preprocessing
# this takes some time with large corpora (building the "text" list, joining it into a string and applying the nlp pipeline). Please be patient.
text = messages_df_anon['message'].tolist()
text = '\n'.join([str(elem) for elem in text])
file_doc = nlp(text, disable=["tagger", "parser", "attribute_ruler", "ner"]) # disabling some components of the pipeline to speed up
complete_filtered_tokens = [preprocess_token(token) for token in file_doc if is_token_allowed(token)]
complete_filtered_tokens = [token for token in complete_filtered_tokens if token not in stoplist]
# Creating dataframe of lemmas and frequencies
counter = Counter(complete_filtered_tokens)
c_df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
c_df.columns = ['lemma', 'count']
c_df['frequency'] = round((c_df['count'] / len(c_df) * 100), 4)
c_df = c_df.sort_values(by=['count'], ascending=False)
c_df.reset_index(inplace = True)
c_df = c_df.drop(columns=['index'])
lemma_df = c_df
# Stoplist reminder
printmd('\n\n### Stoplist:', color = "black")
print(stoplist)
display(lemma_df)
['text', 'type', 'e', 'essere', 'il', '', ' ', 'place', 'name', '\\n', '\\n\\n', 'o']
 | lemma | count | frequency |
---|---|---|---|
0 | protagonista | 44 | 2.5086 |
1 | voce | 40 | 2.2805 |
2 | campo | 38 | 2.1665 |
3 | sapere | 21 | 1.1973 |
4 | tyler | 21 | 1.1973 |
... | ... | ... | ... |
1749 | microonde | 1 | 0.0570 |
1750 | bleu | 1 | 0.0570 |
1751 | cordon | 1 | 0.0570 |
1752 | hobby | 1 | 0.0570 |
1753 | ricoverare | 1 | 0.0570 |
1754 rows × 3 columns
# Generate word cloud from lemmas
generate_wordcloud(lemma_df, "lemma", "count", 50, "white")
Same as above, but instead of using a single bag of words we create a bag of words for each code.
code_dict = {}
for x in dictnamelist:
sliced_df = messages_df_anon[messages_df_anon[x] > 0]
code_dict[x] = sliced_df
code_df_dict = {}
for key, item in tqdm(code_dict.items()):
# Building bag of words and preprocessing
text = item['message'].tolist()
text = '\n'.join([str(elem) for elem in text])
file_doc = nlp(text, disable=["tagger", "parser", "attribute_ruler", "ner"]) # disabling some components of the pipeline to speed up
complete_filtered_tokens = [preprocess_token(token) for token in file_doc if is_token_allowed(token)]
complete_filtered_tokens = [token for token in complete_filtered_tokens if token not in stoplist]
# Creating dataframe of lemmas and frequencies
counter = Counter(complete_filtered_tokens)
c_df = pd.DataFrame.from_dict(counter, orient='index').reset_index()
# Manage exception: empty df (= no messages with one of the codes)
if len(c_df) == 0:
print("WARNING: the code \"" + key + "\" yields an empty df. No frequency of lemmas will be computed for \"" + key + "\".")
else:
c_df.columns = ['lemma', 'count']
c_df['frequency'] = round((c_df['count'] / len(c_df) * 100), 4)
c_df = c_df.sort_values(by=['count'], ascending=False)
c_df.reset_index(inplace = True)
c_df = c_df.drop(columns=['index'])
code_df_dict[key] = c_df
# Display dataframes and word clouds
for key, item in code_df_dict.items():
if item.empty:
message = ('\n### The dataframe for ' + key + ' is empty. Nothing to show. Bummer! \n')
printmd(message, color = "red")
else:
message = key
printmd('\n\n### ' + message, color = "black")
display(item)
generate_wordcloud(item, "lemma", "count", 50, "white")
printmd('\n\n***\n\n', color = "black")
### covid-19
 | lemma | count | frequency |
---|---|---|---|
0 | virus | 12 | 2.2222 |
1 | coronavirus | 10 | 1.8519 |
2 | globale | 8 | 1.4815 |
3 | covid | 8 | 1.4815 |
4 | covid-19 | 7 | 1.2963 |
... | ... | ... | ... |
535 | intero | 1 | 0.1852 |
536 | colpire | 1 | 0.1852 |
537 | rapidità | 1 | 0.1852 |
538 | inesorabile | 1 | 0.1852 |
539 | cacciatore | 1 | 0.1852 |
540 rows × 3 columns
### freedom
 | lemma | count | frequency |
---|---|---|---|
0 | vaccinazione | 8 | 3.3195 |
1 | libertà | 7 | 2.9046 |
2 | credere | 6 | 2.4896 |
3 | raccontare | 4 | 1.6598 |
4 | fronte | 3 | 1.2448 |
... | ... | ... | ... |
236 | grande | 1 | 0.4149 |
237 | scienziato | 1 | 0.4149 |
238 | virus | 1 | 0.4149 |
239 | debole | 1 | 0.4149 |
240 | davvero | 1 | 0.4149 |
241 rows × 3 columns
### green pass
 | lemma | count | frequency |
---|---|---|---|
0 | green | 4 | 6.6667 |
1 | pass | 4 | 6.6667 |
2 | cretino | 3 | 5.0000 |
3 | addossare | 3 | 5.0000 |
4 | dare | 3 | 5.0000 |
5 | aprire | 3 | 5.0000 |
6 | c' | 2 | 3.3333 |
7 | volere | 2 | 3.3333 |
8 | opporre | 2 | 3.3333 |
9 | politico | 2 | 3.3333 |
10 | condizione | 1 | 1.6667 |
11 | decisione | 1 | 1.6667 |
12 | esponente | 1 | 1.6667 |
13 | lockdown | 1 | 1.6667 |
14 | migliaio | 1 | 1.6667 |
15 | ricoverati | 1 | 1.6667 |
16 | morto | 1 | 1.6667 |
17 | organizzare | 1 | 1.6667 |
18 | corteo | 1 | 1.6667 |
19 | mascherina | 1 | 1.6667 |
20 | signore | 1 | 1.6667 |
21 | contenimento | 1 | 1.6667 |
22 | ospedale | 1 | 1.6667 |
23 | pericolo | 1 | 1.6667 |
24 | qual | 1 | 1.6667 |
25 | l’[name | 1 | 1.6667 |
26 | ginocchio | 1 | 1.6667 |
27 | scuola | 1 | 1.6667 |
28 | privo | 1 | 1.6667 |
29 | norma | 1 | 1.6667 |
30 | minimo | 1 | 1.6667 |
31 | sicurezza | 1 | 1.6667 |
32 | totale | 1 | 1.6667 |
33 | proposta | 1 | 1.6667 |
34 | salvini | 1 | 1.6667 |
35 | capitare | 1 | 1.6667 |
36 | giornata | 1 | 1.6667 |
37 | socializzazione | 1 | 1.6667 |
38 | molla | 1 | 1.6667 |
39 | linciaggio | 1 | 1.6667 |
40 | ricerca | 1 | 1.6667 |
41 | svegliare | 1 | 1.6667 |
42 | deputato | 1 | 1.6667 |
43 | covid | 1 | 1.6667 |
44 | dovere | 1 | 1.6667 |
45 | sieropositivo | 1 | 1.6667 |
46 | mezzo | 1 | 1.6667 |
47 | mattina | 1 | 1.6667 |
48 | sentire | 1 | 1.6667 |
49 | lo | 1 | 1.6667 |
50 | dire | 1 | 1.6667 |
51 | grosso | 1 | 1.6667 |
52 | accanito | 1 | 1.6667 |
53 | cacciatore | 1 | 1.6667 |
54 | pensare | 1 | 1.6667 |
55 | gioiello | 1 | 1.6667 |
56 | forza | 1 | 1.6667 |
57 | meloni | 1 | 1.6667 |
58 | cosa | 1 | 1.6667 |
59 | ricoverare | 1 | 1.6667 |
### links
 | lemma | count | frequency |
---|---|---|---|
0 | link | 2 | 28.5714 |
1 | https://it.wikiquote.org/wiki/pandemia_di_covi... | 1 | 14.2857 |
2 | buttare | 1 | 14.2857 |
3 | mention_name | 1 | 14.2857 |
4 | user_id | 1 | 14.2857 |
5 | 1971944511 | 1 | 14.2857 |
6 | https://it.wikiquote.org/wiki/pandemia_di_covi... | 1 | 14.2857 |
### parrot
 | lemma | count | frequency |
---|---|---|---|
0 | pappagallo | 8 | 4.7904 |
1 | cocorite | 6 | 3.5928 |
2 | parlare | 5 | 2.9940 |
3 | uccello | 5 | 2.9940 |
4 | animale | 5 | 2.9940 |
... | ... | ... | ... |
162 | intelligente | 1 | 0.5988 |
163 | incredibilmente | 1 | 0.5988 |
164 | vivace | 1 | 0.5988 |
165 | senso | 1 | 0.5988 |
166 | cocorita | 1 | 0.5988 |
167 rows × 3 columns
### university
 | lemma | count | frequency |
---|---|---|---|
0 | arma | 4 | 2.7972 |
1 | biologico | 4 | 2.7972 |
2 | internazionale | 2 | 1.3986 |
3 | intendere | 2 | 1.3986 |
4 | contrario | 2 | 1.3986 |
... | ... | ... | ... |
138 | 500 | 1 | 0.6993 |
139 | vero | 1 | 0.6993 |
140 | equilibrio | 1 | 0.6993 |
141 | covid | 1 | 0.6993 |
142 | rafforzare | 1 | 0.6993 |
143 rows × 3 columns
### vaccine
 | lemma | count | frequency |
---|---|---|---|
0 | vaccino | 12 | 2.8369 |
1 | vaccinazione | 10 | 2.3641 |
2 | salute | 6 | 1.4184 |
3 | malattia | 5 | 1.1820 |
4 | questione | 4 | 0.9456 |
... | ... | ... | ... |
418 | popolazione | 1 | 0.2364 |
419 | gran | 1 | 0.2364 |
420 | annientare | 1 | 0.2364 |
421 | gates | 1 | 0.2364 |
422 | cacciatore | 1 | 0.2364 |
423 rows × 3 columns
Sentiment analysis calculates the probability of positive or negative sentiment for each message. This is performed using FEEL-IT: Emotion and Sentiment Classification for the Italian Language. Kudos to Federico Bianchi f.bianchi@unibocconi.it, Debora Nozza debora.nozza@unibocconi.it and Dirk Hovy dirk.hovy@unibocconi.it for their amazing work.
In order to bin the sentiment probability and plot it, we label a message as "positive sentiment" when the relative probability of positive sentiment is > 0.75, and as "negative sentiment" when the relative probability of negative sentiment is > 0.75.
def sentiment_analysis(x):
sentence = x
inputs = tokenizer(sentence, return_tensors="pt")
# Call the model and get the logits
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(**inputs, labels=labels)
loss, logits = outputs[:2]
logits = logits.squeeze(0)
# Extract probabilities
proba = torch.nn.functional.softmax(logits, dim=0)
# Unpack the tensor to obtain negative and positive probabilities
negative, positive = proba
neg_prob = np.round(negative.item(),4)
pos_prob = np.round(positive.item(),4)
return(pos_prob, neg_prob)
sentimentlist = []
for x in tqdm(messagelist):
if len(x) > 0:
try:
sentiment = sentiment_analysis(str(x))
sentimentlist.append(sentiment)
except:
sentiment = ('NA', 'NA')
sentimentlist.append(sentiment)
else:
sentiment = ('NA', 'NA')
sentimentlist.append(sentiment)
# append sentiment to anonymized dataframe
positivelist, negativelist = zip(*sentimentlist)
messages_df_anon['sentiment_pos'] = positivelist
messages_df_anon['sentiment_neg'] = negativelist
messages_df_anon = messages_df_anon.replace('NA', np.NaN)
#slice sentiment dataframe by time
timeframes_sentiment = {}
for day in tqdm(dayslist):
    nextday = day + DateOffset(days=1)
    mask = (messages_df_anon['date'] >= day) & (messages_df_anon['date'] < nextday)
    df = messages_df_anon.loc[mask]
    timeframes_sentiment[str(day)[:10]] = df
# create a dataframe with mean sentiment per day
sentiment_mean_dict = {}
for key, item in timeframes_sentiment.items():
positive_mean = item['sentiment_pos'].mean()
negative_mean = item['sentiment_neg'].mean()
sentiment_mean_dict[key] = (positive_mean, negative_mean)
daily_sentiment_df = pd.DataFrame.from_dict(sentiment_mean_dict).T
daily_sentiment_df.columns = ['positive', 'negative']
daily_sentiment_df
day | positive | negative |
---|---|---|
2021-09-21 | 0.336329 | 0.663671 |
2021-09-22 | 0.158732 | 0.841268 |
2021-09-23 | 0.499950 | 0.500050 |
2021-09-24 | 0.141189 | 0.858811 |
# Plotting mean sentiment by code
code_dict = {}
for x in dictnamelist:
sliced_df = messages_df_anon[messages_df_anon[x] > 0]
code_dict[x] = sliced_df
sentiment_by_code = {}
for key, item in code_dict.items():
positive_mean = item['sentiment_pos'].mean()
negative_mean = item['sentiment_neg'].mean()
sentiment_by_code[key] = (positive_mean, negative_mean)
sentiment_by_code_df = pd.DataFrame.from_dict(sentiment_by_code).T
sentiment_by_code_df.columns = ['positive', 'negative']
sentiment_by_code_df
code | positive | negative |
---|---|---|
covid-19 | 0.035784 | 0.964216 |
freedom | 0.375025 | 0.624975 |
green pass | 0.333367 | 0.666633 |
links | 0.000350 | 0.999650 |
parrot | 0.546236 | 0.453764 |
university | 0.000200 | 0.999800 |
vaccine | 0.055769 | 0.944231 |
# Pie chart with sentiment
total_messages = len(messages_df_anon)
positive_messages = len(messages_df_anon[messages_df_anon['sentiment_pos'] > 0.75])
negative_messages = len(messages_df_anon[messages_df_anon['sentiment_neg'] > 0.75])
neutral_messages = total_messages - positive_messages - negative_messages
positive_percent = positive_messages*100/total_messages
negative_percent = negative_messages*100/total_messages
neutral_percent = neutral_messages*100/total_messages
labels = 'Positive sentiment', 'Negative sentiment', 'Neutral sentiment'
sizes = [positive_percent, negative_percent, neutral_percent]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
ax1.axis('equal')
plt.title('Sentiment of messages')
plt.show()
# Plotting sentiment per day
t = daily_sentiment_df.index.tolist()
p = daily_sentiment_df['positive']
n = daily_sentiment_df['negative']
fig, ax = plt.subplots()
ax.plot(t, p)
ax.plot(t, n)
ax.legend(['Positive', 'Negative'])
ax.xaxis.set_major_locator(plt.MaxNLocator(10))
ax.set(xlabel='Day', ylabel='average sentiment probability', title='Sentiment per day')
ax.grid()
plt.xticks(rotation=90)
plt.show()
Here we export the files to be used for further analyses:
Tabular data in .csv files: anonymized messages, user count per day, user activity, message count per day, autocoded messages, prevalence of codes, lemma frequencies (overall and per code), daily sentiment, and sentiment by code.
Structured data in text files: a plain-text export of the anonymized messages and a pre-structured text file for import into MaxQDA.
Exports of the notebook as .html files: a clean report without code and a complete report including the code.
### EXPORT
# Create export folder
dateTimeObj = datetime.now()
timestampStr = dateTimeObj.strftime("%d-%b-%Y - %H.%M.%S")
dirname = str(df1.iloc[0]['name']) + ' - ' + timestampStr
exportdir = 'export/'+dirname
os.mkdir(exportdir)
# Anonymized dataframe
messages_df_anon.to_csv(exportdir + '/messages_df_anon.csv', index=False, encoding="utf8", sep=';')
# User count dataframe
usercount_df.to_csv(exportdir + '/usercount_df.csv', index=True, index_label='day', encoding="utf8", sep=';')
# User activity dataframe
user_activity_df.to_csv(exportdir + '/user_activity_df.csv', index=True, index_label='user', encoding="utf8", sep=';')
# Message count dataframe
messagecount_df.to_csv(exportdir + '/messagecount_df.csv', index=True, index_label='day', encoding="utf8", sep=';')
# Autocoded messages
messages_df_anon_coded.to_csv(exportdir + '/messages_df_anon_coded.csv', index=False, encoding="utf8", sep=';')
# General prevalence of codes
autocode_freq_df.to_csv(exportdir + '/autocode_freq_df.csv', index=True, index_label='code', encoding="utf8", sep=';')
# Lemma frequency dataframe
lemma_df.to_csv(exportdir + '/lemma_freq_df.csv', index=False, encoding="utf8", sep=';')
# Lemma frequency in coded messages
for key, item in code_df_dict.items():
filename = '/autocode_freq_df_' + key + '.csv'
item.to_csv(exportdir + filename, index=True, index_label='code', encoding="utf8", sep=';')
# Daily sentiment
daily_sentiment_df.to_csv(exportdir + '/daily_sentiment_df.csv', index=True, encoding="utf8", sep=';')
# Sentiment by code
sentiment_by_code_df.to_csv(exportdir + '/sentiment_by_code_df.csv', index=True, encoding="utf8", sep=';')
# Text file
with open(exportdir + '/messages_anon.txt', 'w', encoding='utf-8') as f:
f.write('')
with open(exportdir + '/messages_anon.txt', 'a', encoding='utf-8') as f:
for index, row in messages_df_anon.iterrows():
message = str(row['date']) + '\n' + row['pseudonym'] + ':\n' + str(row['message']) + '\n\n'
f.write(message)
# Prestructured text for MaxQDA (import via "import/import structured document")
# Info from MaxQDA:
'''With MAXQDA's preprocessor feature you can split a RTF- or DOC/X document into multiple documents and automatically code text segments during import.
The following tags need to be integrated into the imported document:
#TEXT<Text Name> to mark the beginning of a new text
#CODE<Code\Subcode> to mark the beginning of a new coded segment
#ENDCODE to mark the end of the coded segment
#SPEAKER <Speaker's Name> to mark the beginning of a contribution to a focus group
#ENDSPEAKER to mark the end of the contribution
It is not possible to imbed a code into another code.
After a tag MAXQDA expects the end of the paragraph. To integrate a tag in a paragraph's text add a # (this works not for #TEXT).
Use "\" for subcodes and "&&" to assign several codes for a segment.
Example:
#TEXT Name of first document
#CODE Code 1
This sentence will be coded with "Code 1".
#ENDCODE
#CODE Code 2\Subcode&&OTHERCODE
This sentence will be coded until here #ENDCODE# with "Subcode".
#TEXT Name of second document
This text will be placed into a second document.'''
with open(exportdir + '/messages_anon_MaxQDA.txt', 'w', encoding='utf-8') as f:
f.write('')
with open(exportdir + '/messages_anon_MaxQDA.txt', 'a', encoding='utf-8') as f:
f.write('#TEXT')
f.write('<'+ str(df1.iloc[0]['name']) + ' - Telegram chat>')
f.write('\n\n')
for index, row in messages_df_anon.iterrows():
f.write('#SPEAKER')
f.write('<' + row['pseudonym'] + '>')
f.write('\n')
check = 0
multicode = []
for x in dictnamelist:
multicode.append(row[x])
codelist = []
for x in dictnamelist:
if row[x] > 0:
codelist.append(x)
if sum(multicode) > 0:
f.write('#CODE')
for x in codelist:
f.write(' AUTOCODE\\' + x + ' &&')
f.write('\n')
message = str(row['date']) + '\n' + row['pseudonym'] + ':\n' + str(row['message'])
f.write(message)
f.write('\n')
if sum(multicode) > 0:
f.write('#ENDCODE\n')
f.write('#ENDSPEAKER')
f.write('\n\n')
# Save a clean export of the notebook
command = 'jupyter nbconvert --output-dir=\"' + exportdir + '\" --output "Report_clean.html" --to html --no-input TelegramSocialListening.ipynb'
subprocess.run(command, shell=True, check=True)
# Save an export of the notebook with the code
command = 'jupyter nbconvert --output-dir=\"' + exportdir + '\" --output "Report_complete.html" --to html TelegramSocialListening.ipynb'
subprocess.run(command, shell=True, check=True)
printmd('\n\n### All good, files exported!', color = "green")