import pandas as pd
import jieba
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import numpy as np
from gensim.matutils import hellinger

from gensim import corpora, models
from matplotlib.sankey import Sankey


# Load the three Excel workbooks that hold the text data.
data1 = pd.read_excel('data-1.xlsx')
data2 = pd.read_excel('data-2.xlsx')
data3 = pd.read_excel('data-3.xlsx')
# Extract the documents from the "内容" column of each workbook.
text_data1 = data1["内容"].tolist()
text_data2 = data2["内容"].tolist()
text_data3 = data3["内容"].tolist()
# English stop words shipped with NLTK.
stop_words = set(stopwords.words("english"))
# Register the user-defined dictionary with jieba.
jieba.load_userdict(r'C:\Users\summer\xiaolunwen\A-Ture\0.cidian\user_dict.txt')
# Custom stop words, one entry per line. Stored as a set because the list is
# probed with `in` once per token during preprocessing; membership results are
# the same as with a list, but each lookup is O(1) instead of O(n).
with open(r'C:\Users\summer\xiaolunwen\A-Ture\0.cidian\stop_words.txt', 'r', encoding='utf-8') as f:
    custom_stop_words = {line.strip() for line in f}
# Preprocessing
def preprocess_text(text):
    """Tokenise one document and drop stop words and non-alphabetic tokens.

    Args:
        text: Raw document text. Any non-string value (for example the NaN
            that pandas yields for an empty Excel cell) is treated as an
            empty document.

    Returns:
        A list of lower-cased tokens produced by ``jieba.cut`` for which
        ``str.isalpha()`` is true and which appear in neither ``stop_words``
        nor ``custom_stop_words``.
    """
    # A non-string cell has no .lower(); returning an empty token list keeps
    # one entry per input row instead of aborting the whole run.
    if not isinstance(text, str):
        return []
    # Word segmentation on the lower-cased text.
    tokens = jieba.cut(text.lower())
    # Remove stop words and anything that is not purely alphabetic
    # (punctuation, digits, whitespace).
    return [
        token
        for token in tokens
        if token.isalpha()
        and token not in stop_words
        and token not in custom_stop_words
    ]
# Run the preprocessing function over every document of each data set,
# reporting after each set is finished.
preprocessed_text1 = list(map(preprocess_text, text_data1))
print("完成")
preprocessed_text2 = list(map(preprocess_text, text_data2))
print("完成")
preprocessed_text3 = list(map(preprocess_text, text_data3))
print("完成")

# Topic analysis
# Build one dictionary (token <-> id mapping) per data set.
dictionary1 = corpora.Dictionary(preprocessed_text1)
dictionary2 = corpora.Dictionary(preprocessed_text2)
dictionary3 = corpora.Dictionary(preprocessed_text3)
print("1")

# Build the bag-of-words corpora. Each data set must be encoded with its own
# dictionary: doc2bow ignores tokens the dictionary does not know, so encoding
# the third data set with dictionary2 would silently drop its new vocabulary.
corpus1 = [dictionary1.doc2bow(text) for text in preprocessed_text1]
corpus2 = [dictionary2.doc2bow(text) for text in preprocessed_text2]
corpus3 = [dictionary3.doc2bow(text) for text in preprocessed_text3]

print("2")


# Train one LDA model per data set; id2word must be the dictionary that
# produced the corresponding corpus so term ids map back to the right words.
lda1 = models.LdaModel(corpus1, num_topics=7, id2word=dictionary1, passes=10)
lda2 = models.LdaModel(corpus2, num_topics=11, id2word=dictionary2, passes=10)
lda3 = models.LdaModel(corpus3, num_topics=12, id2word=dictionary3, passes=10)
print("完成")
# Print the keyword summary of every topic, model by model, numbering the
# topics from 1 within each model.
for model, topic_count in ((lda1, 7), (lda2, 11), (lda3, 12)):
    for topic_id in range(topic_count):
        print(f"Topic {topic_id+1}: {model.print_topic(topic_id)}")

# Decide which topic each document belongs to.
def _dominant_topic(topic_probs):
    """Return the 1-based id of the highest-probability topic.

    ``topic_probs`` is the sequence of (topic_id, probability) pairs obtained
    from ``get_document_topics`` for a single document.
    """
    best_topic_id, _ = max(topic_probs, key=lambda pair: pair[1])
    return best_topic_id + 1


# Per-document topic distributions for each model.
doc_topics1 = [lda1.get_document_topics(bow) for bow in corpus1]
doc_topics2 = [lda2.get_document_topics(bow) for bow in corpus2]
doc_topics3 = [lda3.get_document_topics(bow) for bow in corpus3]

# Pick the most probable topic per document and attach it to the
# corresponding DataFrame as a new 'Topic' column.
max_prob_topics1 = [_dominant_topic(pairs) for pairs in doc_topics1]
data1['Topic'] = max_prob_topics1

max_prob_topics2 = [_dominant_topic(pairs) for pairs in doc_topics2]
data2['Topic'] = max_prob_topics2

max_prob_topics3 = [_dominant_topic(pairs) for pairs in doc_topics3]
data3['Topic'] = max_prob_topics3

# Save the topic-labelled data back to Excel.
data1.to_excel('output_with_topics-1.xlsx', index=False)
data2.to_excel('output_with_topics-2.xlsx', index=False)
data3.to_excel('output_with_topics-3.xlsx', index=False)
print("完成")

# Topic evolution
## Stage 1 -> stage 2
from sklearn.metrics.pairwise import cosine_similarity

# Topic-term matrices: one row per topic, one column per term id of the
# model's own dictionary.
topic_vectors_lda1 = lda1.get_topics()
topic_vectors_lda2 = lda2.get_topics()
# Terms that occur in both dictionaries.
common_terms = set(lda1.id2word.token2id.keys()) & set(lda2.id2word.token2id.keys())
# The two dictionaries assign term ids independently, so fix one term order
# and look up each model's own column for every shared term. Filtering each
# matrix in its own id order would make column k describe different words in
# the two matrices and render the cosine similarity meaningless.
ordered_common_terms = sorted(common_terms)
lda1_columns = [lda1.id2word.token2id[term] for term in ordered_common_terms]
lda2_columns = [lda2.id2word.token2id[term] for term in ordered_common_terms]
topic_vectors_lda1_common = topic_vectors_lda1[:, lda1_columns]
topic_vectors_lda2_common = topic_vectors_lda2[:, lda2_columns]
# Cosine similarity between every lda1 topic (rows) and lda2 topic (columns).
similarity_matrix = cosine_similarity(topic_vectors_lda1_common, topic_vectors_lda2_common)
print("Cosine Similarity Matrix:")
print(similarity_matrix)
# Transition probabilities from lda1 to lda2: each row of the similarity
# matrix scaled to sum to 1.
transition_probabilities = similarity_matrix / similarity_matrix.sum(axis=1, keepdims=True)
# Treat weak links (below 0.12) as no transition.
transition_probabilities[transition_probabilities < 0.12] = 0
print("Transition Probabilities:")
print(transition_probabilities)
print(" ")
# Re-normalise each lda1 topic's surviving transitions to sum to 1. A row in
# which every entry fell below the cutoff sums to 0; it is left as all zeros
# instead of becoming NaN through 0/0.
row_totals = transition_probabilities.sum(axis=1, keepdims=True)
normalized_transition_probabilities = np.divide(
    transition_probabilities,
    row_totals,
    out=np.zeros_like(transition_probabilities),
    where=row_totals > 0,
)
print("Normalized Transition Probabilities:")
print(normalized_transition_probabilities)
# Minimum similarity for a match to be reported; adjust as needed.
threshold = 0
# For every lda1 topic, record its most similar lda2 topic.
# Topic indices stored and printed here are 0-based.
topic_matches = []
for i, similarity_row in enumerate(similarity_matrix):
    most_similar_topic_idx = np.argmax(similarity_row)
    similarity_score = similarity_row[most_similar_topic_idx]
    if similarity_score > threshold:
        topic_matches.append((i, most_similar_topic_idx, similarity_score))
print("Topic Matches:")
for i, j, similarity in topic_matches:
    print(f"Topic {i} in LDA1 is similar to Topic {j} in LDA2 with similarity {similarity}")

## Stage 2 -> stage 3
from sklearn.metrics.pairwise import cosine_similarity

# Topic-term matrices: one row per topic, one column per term id of the
# model's own dictionary.
topic_vectors_lda3 = lda3.get_topics()
topic_vectors_lda2 = lda2.get_topics()
# Terms that occur in both dictionaries.
common_terms1 = set(lda2.id2word.token2id.keys()) & set(lda3.id2word.token2id.keys())
# Fix one term order and look up each model's own column for every shared
# term, so that column k refers to the same word in both matrices even when
# the two models use different dictionaries with different term ids.
ordered_common_terms1 = sorted(common_terms1)
lda2_columns_1 = [lda2.id2word.token2id[term] for term in ordered_common_terms1]
lda3_columns = [lda3.id2word.token2id[term] for term in ordered_common_terms1]
topic_vectors_lda2_common_1 = topic_vectors_lda2[:, lda2_columns_1]
topic_vectors_lda3_common = topic_vectors_lda3[:, lda3_columns]
# Cosine similarity between every lda2 topic (rows) and lda3 topic (columns).
similarity_matrix1 = cosine_similarity(topic_vectors_lda2_common_1, topic_vectors_lda3_common)
print("Cosine Similarity Matrix:")
print(similarity_matrix1)
# Transition probabilities from lda2 to lda3: each row of the similarity
# matrix scaled to sum to 1.
transition_probabilities1 = similarity_matrix1 / similarity_matrix1.sum(axis=1, keepdims=True)
# Treat weak links (below 0.13) as no transition.
transition_probabilities1[transition_probabilities1 < 0.13] = 0
print("Transition Probabilities:")
print(transition_probabilities1)
print(" ")
# Re-normalise each lda2 topic's surviving transitions to sum to 1. A row in
# which every entry fell below the cutoff sums to 0; it is left as all zeros
# instead of becoming NaN through 0/0.
row_totals1 = transition_probabilities1.sum(axis=1, keepdims=True)
normalized_transition_probabilities1 = np.divide(
    transition_probabilities1,
    row_totals1,
    out=np.zeros_like(transition_probabilities1),
    where=row_totals1 > 0,
)
print("Normalized Transition Probabilities:")
print(normalized_transition_probabilities1)