gensim 中文文本相似度计算

待我称王封你为后i 2022-06-04 04:45 425阅读 0赞
# -*- coding: utf-8 -*-
# __jiahuiyu__
import io
import logging
from collections import defaultdict

import jieba
from gensim import corpora, models, similarities
  7. logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  8. def a_sub_b(a, b):
  9. ret = []
  10. for el in a:
  11. if el not in b:
  12. ret.append(el)
  13. return ret
  14. punctions = [' ', ',', '。', '、', '?', '~', '!', '@', '#', '¥', '%', '……', '&', '*', '(', ')', '——', '-', '`',
  15. ';', ':']
  16. # stop_list = set([word for word in open('e:/descfile/reprocess/stopword_ch.txt')])
  17. line_seg = []
  18. stop_list = [line.strip() for line in open('e:/descfile/reprocess/stopword_ch.txt').readlines()]
  19. print stop_list
  20. fr = open('e:/descfile/reprocess/documents_test.txt', 'r')
  21. document = fr.readlines()
  22. corpora_documents = []
  23. for line in document:
  24. item_str = []
  25. item = (jieba.cut(line.strip(), cut_all=False))
  26. for i in list(item):
  27. # i = i.encode('utf8')
  28. item_str.append(i)
  29. item_str = a_sub_b(item_str, list(stop_list))
  30. item_str = a_sub_b(item_str, list(punctions))
  31. corpora_documents.append(item_str)
  32. # print corpora_documents
  33. frequency = defaultdict(int)
  34. for sen in corpora_documents:
  35. for token in sen:
  36. frequency[token] += 1
  37. # print frequency
  38. texts = [[token for token in text if frequency[token] > 1] for text in corpora_documents]
  39. # print texts
  40. dictionary = corpora.Dictionary(texts)
  41. dictionary.save('e:/descfile/reprocess/test1.dict')
  42. corpus = [dictionary.doc2bow(text) for text in texts]
  43. corpora.MmCorpus.serialize('e:/descfile/reprocess/test1.mm', corpus)
  44. tfidf = models.TfidfModel(corpus)
  45. corpus_tfidf = tfidf[corpus]
  46. lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)
  47. corpus_lsi = lsi[corpus_tfidf]
  48. lsi.save('e:/descfile/reprocess/test1.lsi')
  49. # lsi = models.LsiModel.load('e:/descfile/reprocess/ch_model.lsi')
  50. index = similarities.MatrixSimilarity(lsi[corpus])
  51. index.save('e:/descfile/reprocess/test1.index')
  52. # index = similarities.MatrixSimilarity.load('e:/descfile/reprocess/ch_test.index')
  53. # test for similarity
  54. new_sen = "恶人多作怪"
  55. new_vec = dictionary.doc2bow(jieba.cut(new_sen, cut_all=False))
  56. # print new_vec
  57. lsi_vec = lsi[new_vec]
  58. # print lsi_vec
  59. sims = index[lsi_vec]
  60. print list(enumerate(sims))
  61. # sorted
  62. simsorted = sorted(enumerate(sims), key=lambda item: -item[1])
  63. print simsorted
  64. """ for sen in corpora_documents: for w in sen: if w not in stop_list and punctions: print w """

发表评论

表情:
评论列表 (有 0 条评论,425人围观)

还没有评论,来说两句吧...

相关阅读