NLP-文本处理:语义角色标注(触发者、谓语、受事者、时间..)【基于“句子A的分词列表B”+“B的词性列表C”+“B与C的依存句法分析结果”】-->关系抽取-->知识图谱

柔光的暖阳◎ 2023-10-05 16:56 65阅读 0赞

一、LTP进行语义角色标注

  1. import os
  2. from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
  3. # pip install pyltp -i https://pypi.tuna.tsinghua.edu.cn/simple 可以先下载好whl文件
  4. #LTP语言平台:http://ltp.ai/index.html
  5. #咱们使用的工具包,pyltp:https://pyltp.readthedocs.io/zh_CN/latest/api.html
  6. #LTP附录:https://ltp.readthedocs.io/zh_CN/latest/appendix.html#id3
  7. #安装方法:https://github.com/HIT-SCIR/pyltp
  8. class LtpParser:
  9. def __init__(self):
  10. LTP_DIR = "./ltp_data_v3.4.0"
  11. self.segmentor = Segmentor() # 分词
  12. self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
  13. self.postagger = Postagger() # 词性标注
  14. self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
  15. self.parser = Parser() # 句法依存分析
  16. self.parser.load(os.path.join(LTP_DIR, "parser.model"))
  17. self.recognizer = NamedEntityRecognizer() # 命名实体识别
  18. self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
  19. self.labeller = SementicRoleLabeller() # 语义角色标注
  20. self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))
  21. # 语义角色标注
  22. def format_labelrole(self, words, postags):
  23. print("分词----> words= {0}----len(words) = {1}".format(words, len(words)))
  24. print("词性标注----> postags= {0}----len(postags) = {1}".format(postags, len(postags)))
  25. arcs = self.parser.parse(words, postags) # 建立依存句法分析树
  26. roles = self.labeller.label(words, postags, arcs)
  27. print("len(roles) = {0}----roles = {1}".format(len(roles), roles))
  28. roles_dict = {
  29. }
  30. for role in roles:
  31. print("谓语所在索引:role.index = {0}".format(role.index))
  32. roles_dict[role.index] = {
  33. arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments}
  34. # {6: {'A0': ['A0', 0, 2], 'TMP': ['TMP', 3, 3], 'LOC': ['LOC', 4, 5], 'A1': ['A1', 8, 8]}}
  35. # 6:表示谓语(发表)所在序号;
  36. # A0:表示“施事者、主体、触发者”,0,2分别表示A0所在的起始索引、终止索引(此句中有2个A0,分别是“奥巴马”、“克林顿”,索引范围是是0-2)
  37. # TMP:表示“时间”,3, 3分别表示TMP所在的起始索引、终止索引(“昨晚”)
  38. # LOC:表示“地点”,4, 5分别表示LOC所在的起始索引、终止索引(“在”,“白宫”)
  39. # A1:表示“受事者”,8, 8分别表示LOC所在的起始索引、终止索引(“演说”)
  40. print("语义角色标注---->roles_dict = {0}".format(roles_dict))
  41. return roles_dict
  42. '''parser主函数'''
  43. def parser_main(self, sentence):
  44. # 分词
  45. words = list(self.segmentor.segment(sentence))
  46. # 词性标注
  47. postags = list(self.postagger.postag(words))
  48. # 语义角色标注
  49. roles_dict = self.format_labelrole(words, postags)
  50. return words, postags, roles_dict
  51. if __name__ == '__main__':
  52. parse = LtpParser()
  53. sentence = '奥巴马与克林顿昨晚在白宫发表了演说'
  54. words, postags, roles_dict = parse.parser_main(sentence)

输出结果:

  1. 分词----> words= ['奥巴马', '与', '克林顿', '昨晚', '在', '白宫', '发表', '了', '演说']----len(words) = 9
  2. 词性标注----> postags= ['nh', 'p', 'nh', 'nt', 'p', 'n', 'v', 'u', 'v']----len(postags) = 9
  3. len(roles) = 1----roles = <pyltp.SementicRoles object at 0x000002170D3CC210>
  4. 谓语所在索引:role.index = 6
  5. 语义角色标注---->roles_dict = {
  6. 6: {
  7. 'A0': ['A0', 0, 2], 'TMP': ['TMP', 3, 3], 'LOC': ['LOC', 4, 5], 'A1': ['A1', 8, 8]}}
  8. Process finished with exit code 0

二、“语义角色标注”结果分析:

{6: {'A0': ['A0', 0, 2], 'TMP': ['TMP', 3, 3], 'LOC': ['LOC', 4, 5], 'A1': ['A1', 8, 8]}}

  • 6:表示谓语(发表)所在序号;
  • A0:表示“施事者、主体、触发者”,0,2分别表示A0所在的起始索引、终止索引(此句中有2个A0,分别是“奥巴马”、“克林顿”,索引范围是0-2)
  • TMP:表示“时间”,3, 3分别表示TMP所在的起始索引、终止索引(“昨晚”)
  • LOC:表示“地点”,4, 5分别表示LOC所在的起始索引、终止索引(“在”,“白宫”)
  • A1:表示“受事者”,8, 8分别表示A1所在的起始索引、终止索引(“演说”)

三、关系抽取【三元组(Triple、主谓宾) 抽取】

  1. import os
  2. import re
  3. from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
  4. # pip install pyltp -i https://pypi.tuna.tsinghua.edu.cn/simple 可以先下载好whl文件
  5. #LTP语言平台:http://ltp.ai/index.html
  6. #咱们使用的工具包,pyltp:https://pyltp.readthedocs.io/zh_CN/latest/api.html
  7. #LTP附录:https://ltp.readthedocs.io/zh_CN/latest/appendix.html#id3
  8. #安装方法:https://github.com/HIT-SCIR/pyltp
  9. class LtpParser:
  10. def __init__(self):
  11. LTP_DIR = "./ltp_data_v3.4.0"
  12. self.segmentor = Segmentor() # 分词
  13. self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
  14. self.postagger = Postagger() # 词性标注
  15. self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
  16. self.parser = Parser() # 句法依存分析
  17. self.parser.load(os.path.join(LTP_DIR, "parser.model"))
  18. self.recognizer = NamedEntityRecognizer() # 命名实体识别
  19. self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
  20. self.labeller = SementicRoleLabeller() # 语义角色标注
  21. self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))
  22. # 依存句法分析【为句子中的每个词语维护一个保存句法依存儿子节点的字典】
  23. def build_parse_child_dict(self, words, postags): # words:分词后的结果;postags:词性标注后的结果;arcs:依存句法分析树
  24. print("-" * 50, "依存句法分析:开始", "-" * 50)
  25. child_dict_list = []
  26. format_parse_list = []
  27. arcs = self.parser.parse(words, postags) # 建立依存句法分析树
  28. print("分词列表:words = {}".format(words))
  29. print("词性分析:postags = {}".format(postags))
  30. rely_ids = [arc.head - 1 for arc in arcs] # 提取该句话的每一个词的依存父节点id【0为ROOT,词语从1开始编号】: [2, 0, 2, 5, 8, 8, 6, 3] - 1 = [1, -1, 1, 4, 7, 7, 5, 2]【此时 -1 表示ROOT】
  31. print("各个词语所依赖的父节点:rely_ids = {0}".format(rely_ids))
  32. heads = ['Root' if rely_id == -1 else words[rely_id] for rely_id in rely_ids] # 匹配依存父节点词语
  33. print("各个词语所依赖的父节点词语 = {0}".format(heads))
  34. relations = [arc.relation for arc in arcs] # 提取依存关系
  35. print("各个词语与所依赖的父节点的依赖关系 = {0}".format(relations))
  36. for word_index in range(len(words)):
  37. print("\nword_index = {0}----word = {1}".format(word_index, words[word_index]))
  38. child_dict = dict() # 每个词语与所有其他词语的关系字典
  39. for arc_index in range(len(arcs)): # arc_index==0时表示ROOT【还没进入“我想听一首迪哥的歌”语句】,arc_index==1时表示“我”
  40. # 当“依存句法分析树”遍历,遇到当前词语时,说明当前词语在依存句法分析树中与其他词语有依存关系
  41. if word_index == rely_ids[arc_index]: # arcs[arc_index].head 表示arcs[arc_index]所代表的词语依存弧的父结点的索引。 ROOT 节点的索引是 0 ,第一个词开始的索引依次为1,2,3,···【“我”的索引为1】arc. relation 表示依存弧的关系。
  42. print("word_index = {0}----arc_index = {1}----rely_ids[arc_index] = {2}----relations[arc_index] = {3}".format(word_index, arc_index, rely_ids[arc_index], relations[arc_index]))
  43. if relations[arc_index] in child_dict: # arcs[arc_index].relation表示arcs[arc_index]所代表的词语与父节点的依存关系(语法关系)
  44. child_dict[relations[arc_index]].append(arc_index) # 添加 child_dict = {'ATT': [4]}----> child_dict = {'ATT': [4, 5]}
  45. else:
  46. child_dict[relations[arc_index]] = [] # 新建
  47. child_dict[relations[arc_index]].append(arc_index) # child_dict = {[]}----> child_dict = {'ATT': [4]}
  48. print("child_dict = {0}".format(child_dict))
  49. child_dict_list.append(child_dict)# 每个词对应的依存关系父节点和其关系
  50. print("child_dict_list = {0}".format(child_dict_list))
  51. # 整合每个词语的句法依存关系
  52. for i in range(len(words)):
  53. a = [relations[i], words[i], i, postags[i], heads[i], rely_ids[i]-1, postags[rely_ids[i]-1]]
  54. format_parse_list.append(a)
  55. print("整合每个词语的句法依存关系---->format_parse_list = ", format_parse_list)
  56. print("-" * 50, "依存句法分析:结束", "-" * 50)
  57. return child_dict_list, format_parse_list
  58. # 语义角色标注
  59. def format_labelrole(self, words, postags):
  60. print("-"*50, "语义角色标注:开始", "-"*50)
  61. print("分词----> words= {0}----len(words) = {1}".format(words, len(words)))
  62. print("词性标注----> postags= {0}----len(postags) = {1}".format(postags, len(postags)))
  63. arcs = self.parser.parse(words, postags) # 建立依存句法分析树
  64. roles = self.labeller.label(words, postags, arcs)
  65. print("len(roles) = {0}----roles = {1}".format(len(roles), roles))
  66. roles_dict = {
  67. }
  68. for role in roles:
  69. print("谓语所在索引:role.index = {0}".format(role.index))
  70. roles_dict[role.index] = {
  71. arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments}
  72. # {6: {'A0': ['A0', 0, 2], 'TMP': ['TMP', 3, 3], 'LOC': ['LOC', 4, 5], 'A1': ['A1', 8, 8]}}
  73. # 6:表示谓语(发表)所在序号;
  74. # A0:表示“施事者、主体、触发者”,0,2分别表示A0所在的起始索引、终止索引(此句中有2个A0,分别是“奥巴马”、“克林顿”,索引范围是是0-2)
  75. # TMP:表示“时间”,3, 3分别表示TMP所在的起始索引、终止索引(“昨晚”)
  76. # LOC:表示“地点”,4, 5分别表示LOC所在的起始索引、终止索引(“在”,“白宫”)
  77. # A1:表示“受事者”,8, 8分别表示LOC所在的起始索引、终止索引(“演说”)
  78. print("语义角色标注---->roles_dict = {0}".format(roles_dict))
  79. print("-" * 50, "语义角色标注:结束", "-" * 50)
  80. return roles_dict
  81. # parser主函数
  82. def parser_main(self, sentence):
  83. # 分词
  84. words = list(self.segmentor.segment(sentence))
  85. # 词性标注
  86. postags = list(self.postagger.postag(words))
  87. # 依存句法分析
  88. child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags)
  89. # 语义角色标注
  90. roles_dict = self.format_labelrole(words, postags)
  91. return words, postags, child_dict_list, format_parse_list, roles_dict
  92. # 关系抽取类
  93. class TripleExtractor:
  94. def __init__(self):
  95. self.parser = LtpParser()
  96. '''文章分句处理, 切分长句,冒号,分号,感叹号等做切分标识'''
  97. def split_sents(self, content):
  98. return [sentence for sentence in re.split(r'[??!!。;;::\n\r]', content) if sentence]
  99. '''利用语义角色标注,直接获取主谓宾三元组,基于A0,A1,A2'''
  100. def ruler1(self, words, postags, roles_dict, role_index):
  101. v = words[role_index]
  102. role_info = roles_dict[role_index]
  103. if 'A0' in role_info.keys() and 'A1' in role_info.keys():
  104. s = ''.join([words[word_index] for word_index in range(role_info['A0'][1], role_info['A0'][2]+1) if
  105. postags[word_index][0] not in ['w', 'u', 'x'] and words[word_index]])
  106. o = ''.join([words[word_index] for word_index in range(role_info['A1'][1], role_info['A1'][2]+1) if
  107. postags[word_index][0] not in ['w', 'u', 'x'] and words[word_index]])
  108. if s and o:
  109. return '1', [s, v, o]
  110. return '4', []
  111. '''三元组抽取主函数'''
  112. def ruler2(self, words, postags, child_dict_list, format_parse_list, roles_dict):
  113. svos = []
  114. for index in range(len(postags)):
  115. tmp = 1
  116. # 先借助语义角色标注的结果,进行三元组抽取
  117. if index in roles_dict:
  118. flag, triple = self.ruler1(words, postags, roles_dict, index)
  119. if flag == '1':
  120. svos.append(triple)
  121. tmp = 0
  122. if tmp == 1:
  123. # 如果语义角色标记为空,则使用依存句法进行抽取
  124. # if postags[index] == 'v':
  125. if postags[index]:
  126. # 抽取以谓词为中心的事实三元组
  127. child_dict = child_dict_list[index]
  128. # 主谓宾
  129. if 'SBV' in child_dict and 'VOB' in child_dict:
  130. r = words[index]
  131. e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
  132. e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
  133. svos.append([e1, r, e2])
  134. # 定语后置,动宾关系
  135. relation = format_parse_list[index][0]
  136. head = format_parse_list[index][2]
  137. if relation == 'ATT':
  138. if 'VOB' in child_dict:
  139. e1 = self.complete_e(words, postags, child_dict_list, head - 1)
  140. r = words[index]
  141. e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
  142. temp_string = r + e2
  143. if temp_string == e1[:len(temp_string)]:
  144. e1 = e1[len(temp_string):]
  145. if temp_string not in e1:
  146. svos.append([e1, r, e2])
  147. # 含有介宾关系的主谓动补关系
  148. if 'SBV' in child_dict and 'CMP' in child_dict:
  149. e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
  150. cmp_index = child_dict['CMP'][0]
  151. r = words[index] + words[cmp_index]
  152. if 'POB' in child_dict_list[cmp_index]:
  153. e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
  154. svos.append([e1, r, e2])
  155. return svos
  156. '''对找出的主语或者宾语进行扩展:【定中关系 ATT 红苹果 (红 <– 苹果)】'''
  157. def complete_e(self, words, postags, child_dict_list, word_index):
  158. child_dict = child_dict_list[word_index]
  159. prefix = ''
  160. if 'ATT' in child_dict:
  161. for i in range(len(child_dict['ATT'])):
  162. prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
  163. postfix = ''
  164. if postags[word_index] == 'v':
  165. if 'VOB' in child_dict:
  166. postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
  167. if 'SBV' in child_dict:
  168. prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
  169. return prefix + words[word_index] + postfix
  170. '''程序主控函数'''
  171. def triples_main(self, text):
  172. sentences = self.split_sents(text)
  173. svos = []
  174. for index, sentence in enumerate(sentences):
  175. print("="*50, "第{}句:开始".format(index + 1), "="*50)
  176. # words: 分词; postags: 词性标注; child_dict_list: 依存句法分析; roles_dict: 语义角色标注
  177. words, postags, child_dict_list, format_parse_list, roles_dict = self.parser.parser_main(sentence)
  178. svo = self.ruler2(words, postags, child_dict_list, format_parse_list, roles_dict)
  179. print("svo = {0}".format(svo))
  180. print("=" * 50, "第{}句:结束".format(index + 1), "=" * 50)
  181. svos += svo
  182. return svos
  183. # 关系抽取
  184. def run_extractor(text):
  185. extractor = TripleExtractor()
  186. svos = extractor.triples_main(text)
  187. return svos
  188. if __name__ == '__main__':
  189. # 关系抽取
  190. # text = '奥巴马与克林顿昨晚在白宫发表了演说'
  191. text = '我购买了一件玩具,孩子非常喜欢这个玩具,但是质量不太好。希望商家能够保障商品质量,不要再出现类似问题。'
  192. svos = run_extractor(text)
  193. print("关系抽取结果:svos = {0}".format(svos))

打印结果:

  1. ================================================== 第1句:开始 ==================================================
  2. -------------------------------------------------- 依存句法分析:开始 --------------------------------------------------
  3. 分词列表:words = ['我', '购买', '了', '一', '件', '玩具', ',', '孩子', '非常', '喜欢', '这个', '玩具', ',', '但是', '质量', '不', '太', '好']
  4. 词性分析:postags = ['r', 'v', 'u', 'm', 'q', 'n', 'wp', 'n', 'd', 'v', 'r', 'n', 'wp', 'c', 'n', 'd', 'd', 'a']
  5. 各个词语所依赖的父节点:rely_ids = [1, -1, 1, 4, 5, 1, 1, 9, 9, 1, 11, 9, 9, 17, 17, 17, 17, 9]
  6. 各个词语所依赖的父节点词语 = ['购买', 'Root', '购买', '件', '玩具', '购买', '购买', '喜欢', '喜欢', '购买', '玩具', '喜欢', '喜欢', '好', '好', '好', '好', '喜欢']
  7. 各个词语与所依赖的父节点的依赖关系 = ['SBV', 'HED', 'RAD', 'ATT', 'ATT', 'VOB', 'WP', 'SBV', 'ADV', 'COO', 'ATT', 'VOB', 'WP', 'ADV', 'SBV', 'ADV', 'ADV', 'COO']
  8. word_index = 0----word =
  9. child_dict_list = [{
  10. }]
  11. word_index = 1----word = 购买
  12. word_index = 1----arc_index = 0----rely_ids[arc_index] = 1----relations[arc_index] = SBV
  13. child_dict = {
  14. 'SBV': [0]}
  15. word_index = 1----arc_index = 2----rely_ids[arc_index] = 1----relations[arc_index] = RAD
  16. child_dict = {
  17. 'SBV': [0], 'RAD': [2]}
  18. word_index = 1----arc_index = 5----rely_ids[arc_index] = 1----relations[arc_index] = VOB
  19. child_dict = {
  20. 'SBV': [0], 'RAD': [2], 'VOB': [5]}
  21. word_index = 1----arc_index = 6----rely_ids[arc_index] = 1----relations[arc_index] = WP
  22. child_dict = {
  23. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6]}
  24. word_index = 1----arc_index = 9----rely_ids[arc_index] = 1----relations[arc_index] = COO
  25. child_dict = {
  26. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}
  27. child_dict_list = [{
  28. }, {
  29. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}]
  30. word_index = 2----word =
  31. child_dict_list = [{
  32. }, {
  33. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  34. }]
  35. word_index = 3----word =
  36. child_dict_list = [{
  37. }, {
  38. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  39. }, {
  40. }]
  41. word_index = 4----word =
  42. word_index = 4----arc_index = 3----rely_ids[arc_index] = 4----relations[arc_index] = ATT
  43. child_dict = {
  44. 'ATT': [3]}
  45. child_dict_list = [{
  46. }, {
  47. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  48. }, {
  49. }, {
  50. 'ATT': [3]}]
  51. word_index = 5----word = 玩具
  52. word_index = 5----arc_index = 4----rely_ids[arc_index] = 5----relations[arc_index] = ATT
  53. child_dict = {
  54. 'ATT': [4]}
  55. child_dict_list = [{
  56. }, {
  57. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  58. }, {
  59. }, {
  60. 'ATT': [3]}, {
  61. 'ATT': [4]}]
  62. word_index = 6----word =
  63. child_dict_list = [{
  64. }, {
  65. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  66. }, {
  67. }, {
  68. 'ATT': [3]}, {
  69. 'ATT': [4]}, {
  70. }]
  71. word_index = 7----word = 孩子
  72. child_dict_list = [{
  73. }, {
  74. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  75. }, {
  76. }, {
  77. 'ATT': [3]}, {
  78. 'ATT': [4]}, {
  79. }, {
  80. }]
  81. word_index = 8----word = 非常
  82. child_dict_list = [{
  83. }, {
  84. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  85. }, {
  86. }, {
  87. 'ATT': [3]}, {
  88. 'ATT': [4]}, {
  89. }, {
  90. }, {
  91. }]
  92. word_index = 9----word = 喜欢
  93. word_index = 9----arc_index = 7----rely_ids[arc_index] = 9----relations[arc_index] = SBV
  94. child_dict = {
  95. 'SBV': [7]}
  96. word_index = 9----arc_index = 8----rely_ids[arc_index] = 9----relations[arc_index] = ADV
  97. child_dict = {
  98. 'SBV': [7], 'ADV': [8]}
  99. word_index = 9----arc_index = 11----rely_ids[arc_index] = 9----relations[arc_index] = VOB
  100. child_dict = {
  101. 'SBV': [7], 'ADV': [8], 'VOB': [11]}
  102. word_index = 9----arc_index = 12----rely_ids[arc_index] = 9----relations[arc_index] = WP
  103. child_dict = {
  104. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12]}
  105. word_index = 9----arc_index = 17----rely_ids[arc_index] = 9----relations[arc_index] = COO
  106. child_dict = {
  107. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}
  108. child_dict_list = [{
  109. }, {
  110. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  111. }, {
  112. }, {
  113. 'ATT': [3]}, {
  114. 'ATT': [4]}, {
  115. }, {
  116. }, {
  117. }, {
  118. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}]
  119. word_index = 10----word = 这个
  120. child_dict_list = [{
  121. }, {
  122. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  123. }, {
  124. }, {
  125. 'ATT': [3]}, {
  126. 'ATT': [4]}, {
  127. }, {
  128. }, {
  129. }, {
  130. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}, {
  131. }]
  132. word_index = 11----word = 玩具
  133. word_index = 11----arc_index = 10----rely_ids[arc_index] = 11----relations[arc_index] = ATT
  134. child_dict = {
  135. 'ATT': [10]}
  136. child_dict_list = [{
  137. }, {
  138. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  139. }, {
  140. }, {
  141. 'ATT': [3]}, {
  142. 'ATT': [4]}, {
  143. }, {
  144. }, {
  145. }, {
  146. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}, {
  147. }, {
  148. 'ATT': [10]}]
  149. word_index = 12----word =
  150. child_dict_list = [{
  151. }, {
  152. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  153. }, {
  154. }, {
  155. 'ATT': [3]}, {
  156. 'ATT': [4]}, {
  157. }, {
  158. }, {
  159. }, {
  160. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}, {
  161. }, {
  162. 'ATT': [10]}, {
  163. }]
  164. word_index = 13----word = 但是
  165. child_dict_list = [{
  166. }, {
  167. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  168. }, {
  169. }, {
  170. 'ATT': [3]}, {
  171. 'ATT': [4]}, {
  172. }, {
  173. }, {
  174. }, {
  175. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}, {
  176. }, {
  177. 'ATT': [10]}, {
  178. }, {
  179. }]
  180. word_index = 14----word = 质量
  181. child_dict_list = [{
  182. }, {
  183. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  184. }, {
  185. }, {
  186. 'ATT': [3]}, {
  187. 'ATT': [4]}, {
  188. }, {
  189. }, {
  190. }, {
  191. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}, {
  192. }, {
  193. 'ATT': [10]}, {
  194. }, {
  195. }, {
  196. }]
  197. word_index = 15----word =
  198. child_dict_list = [{
  199. }, {
  200. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  201. }, {
  202. }, {
  203. 'ATT': [3]}, {
  204. 'ATT': [4]}, {
  205. }, {
  206. }, {
  207. }, {
  208. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}, {
  209. }, {
  210. 'ATT': [10]}, {
  211. }, {
  212. }, {
  213. }, {
  214. }]
  215. word_index = 16----word =
  216. child_dict_list = [{
  217. }, {
  218. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  219. }, {
  220. }, {
  221. 'ATT': [3]}, {
  222. 'ATT': [4]}, {
  223. }, {
  224. }, {
  225. }, {
  226. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}, {
  227. }, {
  228. 'ATT': [10]}, {
  229. }, {
  230. }, {
  231. }, {
  232. }, {
  233. }]
  234. word_index = 17----word =
  235. word_index = 17----arc_index = 13----rely_ids[arc_index] = 17----relations[arc_index] = ADV
  236. child_dict = {
  237. 'ADV': [13]}
  238. word_index = 17----arc_index = 14----rely_ids[arc_index] = 17----relations[arc_index] = SBV
  239. child_dict = {
  240. 'ADV': [13], 'SBV': [14]}
  241. word_index = 17----arc_index = 15----rely_ids[arc_index] = 17----relations[arc_index] = ADV
  242. child_dict = {
  243. 'ADV': [13, 15], 'SBV': [14]}
  244. word_index = 17----arc_index = 16----rely_ids[arc_index] = 17----relations[arc_index] = ADV
  245. child_dict = {
  246. 'ADV': [13, 15, 16], 'SBV': [14]}
  247. child_dict_list = [{
  248. }, {
  249. 'SBV': [0], 'RAD': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  250. }, {
  251. }, {
  252. 'ATT': [3]}, {
  253. 'ATT': [4]}, {
  254. }, {
  255. }, {
  256. }, {
  257. 'SBV': [7], 'ADV': [8], 'VOB': [11], 'WP': [12], 'COO': [17]}, {
  258. }, {
  259. 'ATT': [10]}, {
  260. }, {
  261. }, {
  262. }, {
  263. }, {
  264. }, {
  265. 'ADV': [13, 15, 16], 'SBV': [14]}]
  266. 整合每个词语的句法依存关系---->format_parse_list = [['SBV', '我', 0, 'r', '购买', 0, 'r'], ['HED', '购买', 1, 'v', 'Root', -2, 'd'], ['RAD', '了', 2, 'u', '购买', 0, 'r'], ['ATT', '一', 3, 'm', '件', 3, 'm'], ['ATT', '件', 4, 'q', '玩具', 4, 'q'], ['VOB', '玩具', 5, 'n', '购买', 0, 'r'], ['WP', ',', 6, 'wp', '购买', 0, 'r'], ['SBV', '孩子', 7, 'n', '喜欢', 8, 'd'], ['ADV', '非常', 8, 'd', '喜欢', 8, 'd'], ['COO', '喜欢', 9, 'v', '购买', 0, 'r'], ['ATT', '这个', 10, 'r', '玩具', 10, 'r'], ['VOB', '玩具', 11, 'n', '喜欢', 8, 'd'], ['WP', ',', 12, 'wp', '喜欢', 8, 'd'], ['ADV', '但是', 13, 'c', '好', 16, 'd'], ['SBV', '质量', 14, 'n', '好', 16, 'd'], ['ADV', '不', 15, 'd', '好', 16, 'd'], ['ADV', '太', 16, 'd', '好', 16, 'd'], ['COO', '好', 17, 'a', '喜欢', 8, 'd']]
  267. -------------------------------------------------- 依存句法分析:结束 --------------------------------------------------
  268. -------------------------------------------------- 语义角色标注:开始 --------------------------------------------------
  269. 分词----> words= ['我', '购买', '了', '一', '件', '玩具', ',', '孩子', '非常', '喜欢', '这个', '玩具', ',', '但是', '质量', '不', '太', '好']----len(words) = 18
  270. 词性标注----> postags= ['r', 'v', 'u', 'm', 'q', 'n', 'wp', 'n', 'd', 'v', 'r', 'n', 'wp', 'c', 'n', 'd', 'd', 'a']----len(postags) = 18
  271. len(roles) = 2----roles = <pyltp.SementicRoles object at 0x0000026FF8D2E870>
  272. 谓语所在索引:role.index = 1
  273. 谓语所在索引:role.index = 17
  274. 语义角色标注---->roles_dict = {
  275. 1: {
  276. 'A0': ['A0', 0, 0], 'A1': ['A1', 3, 5]}, 17: {
  277. 'DIS': ['DIS', 13, 13], 'A0': ['A0', 14, 14], 'ADV': ['ADV', 16, 16]}}
  278. -------------------------------------------------- 语义角色标注:结束 --------------------------------------------------
  279. svo = [['我', '购买', '一件玩具'], ['孩子', '喜欢', '这个玩具']]
  280. ================================================== 第1句:结束 ==================================================
  281. ================================================== 第2句:开始 ==================================================
  282. -------------------------------------------------- 依存句法分析:开始 --------------------------------------------------
  283. 分词列表:words = ['希望', '商家', '能够', '保障', '商品', '质量', ',', '不要', '再', '出现', '类似', '问题']
  284. 词性分析:postags = ['v', 'n', 'v', 'v', 'n', 'n', 'wp', 'd', 'd', 'v', 'v', 'n']
  285. 各个词语所依赖的父节点:rely_ids = [-1, 3, 3, 0, 5, 3, 3, 9, 9, 3, 11, 9]
  286. 各个词语所依赖的父节点词语 = ['Root', '保障', '保障', '希望', '质量', '保障', '保障', '出现', '出现', '保障', '问题', '出现']
  287. 各个词语与所依赖的父节点的依赖关系 = ['HED', 'SBV', 'ADV', 'VOB', 'ATT', 'VOB', 'WP', 'ADV', 'ADV', 'COO', 'ATT', 'VOB']
  288. word_index = 0----word = 希望
  289. word_index = 0----arc_index = 3----rely_ids[arc_index] = 0----relations[arc_index] = VOB
  290. child_dict = {
  291. 'VOB': [3]}
  292. child_dict_list = [{
  293. 'VOB': [3]}]
  294. word_index = 1----word = 商家
  295. child_dict_list = [{
  296. 'VOB': [3]}, {
  297. }]
  298. word_index = 2----word = 能够
  299. child_dict_list = [{
  300. 'VOB': [3]}, {
  301. }, {
  302. }]
  303. word_index = 3----word = 保障
  304. word_index = 3----arc_index = 1----rely_ids[arc_index] = 3----relations[arc_index] = SBV
  305. child_dict = {
  306. 'SBV': [1]}
  307. word_index = 3----arc_index = 2----rely_ids[arc_index] = 3----relations[arc_index] = ADV
  308. child_dict = {
  309. 'SBV': [1], 'ADV': [2]}
  310. word_index = 3----arc_index = 5----rely_ids[arc_index] = 3----relations[arc_index] = VOB
  311. child_dict = {
  312. 'SBV': [1], 'ADV': [2], 'VOB': [5]}
  313. word_index = 3----arc_index = 6----rely_ids[arc_index] = 3----relations[arc_index] = WP
  314. child_dict = {
  315. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6]}
  316. word_index = 3----arc_index = 9----rely_ids[arc_index] = 3----relations[arc_index] = COO
  317. child_dict = {
  318. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}
  319. child_dict_list = [{
  320. 'VOB': [3]}, {
  321. }, {
  322. }, {
  323. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}]
  324. word_index = 4----word = 商品
  325. child_dict_list = [{
  326. 'VOB': [3]}, {
  327. }, {
  328. }, {
  329. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  330. }]
  331. word_index = 5----word = 质量
  332. word_index = 5----arc_index = 4----rely_ids[arc_index] = 5----relations[arc_index] = ATT
  333. child_dict = {
  334. 'ATT': [4]}
  335. child_dict_list = [{
  336. 'VOB': [3]}, {
  337. }, {
  338. }, {
  339. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  340. }, {
  341. 'ATT': [4]}]
  342. word_index = 6----word =
  343. child_dict_list = [{
  344. 'VOB': [3]}, {
  345. }, {
  346. }, {
  347. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  348. }, {
  349. 'ATT': [4]}, {
  350. }]
  351. word_index = 7----word = 不要
  352. child_dict_list = [{
  353. 'VOB': [3]}, {
  354. }, {
  355. }, {
  356. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  357. }, {
  358. 'ATT': [4]}, {
  359. }, {
  360. }]
  361. word_index = 8----word =
  362. child_dict_list = [{
  363. 'VOB': [3]}, {
  364. }, {
  365. }, {
  366. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  367. }, {
  368. 'ATT': [4]}, {
  369. }, {
  370. }, {
  371. }]
  372. word_index = 9----word = 出现
  373. word_index = 9----arc_index = 7----rely_ids[arc_index] = 9----relations[arc_index] = ADV
  374. child_dict = {
  375. 'ADV': [7]}
  376. word_index = 9----arc_index = 8----rely_ids[arc_index] = 9----relations[arc_index] = ADV
  377. child_dict = {
  378. 'ADV': [7, 8]}
  379. word_index = 9----arc_index = 11----rely_ids[arc_index] = 9----relations[arc_index] = VOB
  380. child_dict = {
  381. 'ADV': [7, 8], 'VOB': [11]}
  382. child_dict_list = [{
  383. 'VOB': [3]}, {
  384. }, {
  385. }, {
  386. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  387. }, {
  388. 'ATT': [4]}, {
  389. }, {
  390. }, {
  391. }, {
  392. 'ADV': [7, 8], 'VOB': [11]}]
  393. word_index = 10----word = 类似
  394. child_dict_list = [{
  395. 'VOB': [3]}, {
  396. }, {
  397. }, {
  398. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  399. }, {
  400. 'ATT': [4]}, {
  401. }, {
  402. }, {
  403. }, {
  404. 'ADV': [7, 8], 'VOB': [11]}, {
  405. }]
  406. word_index = 11----word = 问题
  407. word_index = 11----arc_index = 10----rely_ids[arc_index] = 11----relations[arc_index] = ATT
  408. child_dict = {
  409. 'ATT': [10]}
  410. child_dict_list = [{
  411. 'VOB': [3]}, {
  412. }, {
  413. }, {
  414. 'SBV': [1], 'ADV': [2], 'VOB': [5], 'WP': [6], 'COO': [9]}, {
  415. }, {
  416. 'ATT': [4]}, {
  417. }, {
  418. }, {
  419. }, {
  420. 'ADV': [7, 8], 'VOB': [11]}, {
  421. }, {
  422. 'ATT': [10]}]
  423. 整合每个词语的句法依存关系---->format_parse_list = [['HED', '希望', 0, 'v', 'Root', -2, 'v'], ['SBV', '商家', 1, 'n', '保障', 2, 'v'], ['ADV', '能够', 2, 'v', '保障', 2, 'v'], ['VOB', '保障', 3, 'v', '希望', -1, 'n'], ['ATT', '商品', 4, 'n', '质量', 4, 'n'], ['VOB', '质量', 5, 'n', '保障', 2, 'v'], ['WP', ',', 6, 'wp', '保障', 2, 'v'], ['ADV', '不要', 7, 'd', '出现', 8, 'd'], ['ADV', '再', 8, 'd', '出现', 8, 'd'], ['COO', '出现', 9, 'v', '保障', 2, 'v'], ['ATT', '类似', 10, 'v', '问题', 10, 'v'], ['VOB', '问题', 11, 'n', '出现', 8, 'd']]
  424. -------------------------------------------------- 依存句法分析:结束 --------------------------------------------------
  425. -------------------------------------------------- 语义角色标注:开始 --------------------------------------------------
  426. 分词----> words= ['希望', '商家', '能够', '保障', '商品', '质量', ',', '不要', '再', '出现', '类似', '问题']----len(words) = 12
  427. 词性标注----> postags= ['v', 'n', 'v', 'v', 'n', 'n', 'wp', 'd', 'd', 'v', 'v', 'n']----len(postags) = 12
  428. len(roles) = 4----roles = <pyltp.SementicRoles object at 0x0000026FF8D2E870>
  429. 谓语所在索引:role.index = 0
  430. 谓语所在索引:role.index = 3
  431. 谓语所在索引:role.index = 7
  432. 谓语所在索引:role.index = 9
  433. 语义角色标注---->roles_dict = {
  434. 0: {
  435. 'A1': ['A1', 1, 11]}, 3: {
  436. 'A0': ['A0', 1, 1], 'A1': ['A1', 4, 5]}, 7: {
  437. 'A0': ['A0', 1, 1]}, 9: {
  438. 'ADV': ['ADV', 8, 8], 'A1': ['A1', 10, 11]}}
  439. -------------------------------------------------- 语义角色标注:结束 --------------------------------------------------
  440. svo = [['商家', '保障', '商品质量']]
  441. ================================================== 第2句:结束 ==================================================
  442. 关系抽取结果:svos = [['我', '购买', '一件玩具'], ['孩子', '喜欢', '这个玩具'], ['商家', '保障', '商品质量']]
  443. Process finished with exit code 0

四、构建知识图谱

通过上一步骤的关系抽取的三元组结果【A0(触发者)谓语词A1(受事者)】就可以创建知识图谱的实体与关系。

发表评论

表情:
评论列表 (有 0 条评论,65人围观)

还没有评论,来说两句吧...

相关阅读