自学内容网 自学内容网

从文本构建金庸家族知识图谱

针对金庸家族的文本描述构建家族知识图谱。

"金庸的姐夫是钱学森,钱学森的妻子是蒋英,金庸的表哥是徐志摩,金庸的堂姐是琼瑶,金庸有一个哥哥,名叫查良钊。金庸有个儿子是查传侠"

本体模型:人与人的亲戚关系(

"表哥": "brother_low",
"姐夫": "sister_husband",
"妻子": "wife",
"堂姐":"cousin",
"父亲": "father",
"母亲": "mother",
"哥哥": "brother",
"姐姐": "sister",
"弟弟": "brother",
"妹妹": "sister",
"儿子": "son",
"女儿": "daughter")

步骤1: 安装必要的库

确保你已经安装了所需的库:

pip install torch transformers py2neo

步骤2: 加载预训练模型

加载 Hugging Face 的 BERT 模型进行命名实体识别:

from transformers import BertTokenizer, BertForTokenClassification
import torch
from py2neo import Graph, Node, Relationship

# Load the tokenizer and the token-classification model used for NER.
# NOTE(review): the tokenizer is loaded from 'bert-base-chinese' while the
# weights come from 'ckiplab/bert-base-chinese-ner' — presumably both share
# the same vocabulary; confirm against the model card.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForTokenClassification.from_pretrained('ckiplab/bert-base-chinese-ner')

步骤3: 文本预处理

准备一段关于金庸家族的文本,并对其进行预处理:

# Text preprocessing: the raw description of the family, encoded as a
# single-sentence batch of PyTorch tensors for the model.
text = "金庸的姐夫是钱学森,钱学森的妻子是蒋英,金庸的表哥是徐志摩,金庸的堂姐是琼瑶,金庸有一个哥哥,名叫查良钊。金庸有个儿子是查传侠"
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

步骤4: 实体识别

使用模型预测文本中的实体:

# Entity recognition: run the token-classification model over the encoded
# text and keep the highest-scoring label id for every token.
with torch.no_grad():  # inference only, so no autograd graph is needed
    outputs = model(**inputs)
# Index the single batch element explicitly instead of squeeze(), which would
# also collapse any other size-1 dimension.
predictions = torch.argmax(outputs.logits, dim=2)[0]

# Decode the predictions into (token_list, label) groups.
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
entities = []
current_entity = []
current_label = None

# Walk over every token together with its predicted label.
for token, prediction in zip(tokens, predictions):
    label = model.config.id2label[prediction.item()]

    if label.startswith(("B-", "S-")):  # start of an entity (S- = single-token entity)
        if current_entity:  # flush the entity that was still open
            entities.append((current_entity, current_label))
        current_entity = [token]
        current_label = label[2:]  # drop the prefix, keep the entity type
        if label.startswith("S-"):
            # A single-token entity is complete immediately; storing it with
            # the bare type lets later "PERSON" comparisons match it.
            entities.append((current_entity, current_label))
            current_entity = []
            current_label = None
    elif label.startswith(("I-", "E-")):  # inside or end of an entity
        if not current_entity:
            # Continuation tag without a preceding B-: take the type from this
            # tag so the entity is not stored with a None label.
            current_label = label[2:]
        current_entity.append(token)
        if label.startswith("E-"):  # end tag closes the entity
            entities.append((current_entity, current_label))
            current_entity = []
            current_label = None
    else:
        if current_entity:  # a non-entity token closes any open entity
            entities.append((current_entity, current_label))
            current_entity = []
            current_label = None
        entities.append(([token], label))  # keep non-entity tokens as single-token groups

# Flush an entity that runs up to the end of the sequence.
if current_entity:
    entities.append((current_entity, current_label))

# Join the tokens of each group into a string and re-assemble relation words
# that were split across two adjacent groups (e.g. "父" + "亲").
relation_words = {"表哥", "姐夫", "妻子", "堂姐", "父亲", "母亲", "哥哥", "姐姐", "弟弟",
                  "妹妹", "儿子", "女儿"}
final_entities = []
i = 0
while i < len(entities):
    entity, label = entities[i]
    entity_str = ''.join(entity)

    if i + 1 < len(entities):
        merged = entity_str + ''.join(entities[i + 1][0])
        if merged in relation_words:
            entity_str = merged
            label = "REL"  # mark the merged relation word
            i += 1  # the following group has been consumed by the merge
    final_entities.append((entity_str, label))
    i += 1

print("final_entities:", final_entities)

步骤5: 关系抽取

假设我们已经知道了一些固定的关系模式,例如“父亲”、“母亲”等。我们可以基于这些模式来抽取关系:

# Relation patterns: map each Chinese kinship word to the relationship type
# that is stored in the graph.
relation_patterns = {
    "表哥": "brother_low",
    "姐夫": "sister_husband",
    "妻子": "wife",
    "堂姐":"cousin",
    "父亲": "father",
    "母亲": "mother",
    "哥哥": "brother",
    "姐姐": "sister",
    "弟弟": "brother",
    "妹妹": "sister",
    "儿子": "son",
    "女儿": "daughter"
}


def _nearest_person(entries, start, step):
    """Return the text of the closest PERSON entry, scanning from ``start``.

    ``step`` is -1 to scan backwards or +1 to scan forwards. Returns None when
    no PERSON entry exists in that direction.
    """
    idx = start
    while 0 <= idx < len(entries):
        text, tag = entries[idx]
        if tag == "PERSON":
            return text
        idx += step
    return None


# Relation extraction: for every merged relation word, link the nearest
# PERSON before it to the nearest PERSON after it.
relations = []
for i, (token, label) in enumerate(final_entities):
    if label != "REL" or token not in relation_patterns:
        continue
    relation = relation_patterns[token]

    # Both ends are looked up fresh for every relation word, so a missing
    # neighbour yields None instead of raising NameError or silently reusing
    # the person found for a previous relation.
    entity1 = _nearest_person(final_entities, i - 1, -1)
    entity2 = _nearest_person(final_entities, i + 1, 1)

    # Record the triple only when both persons were found.
    if entity1 and entity2:
        relations.append((entity1, relation, entity2))

print("Extracted relations:", relations)

步骤6: 连接到 Neo4j 数据库

使用 py2neo 库连接到 Neo4j 数据库:

# Connect to the Neo4j database over Bolt.
# NOTE(review): the credentials are hard-coded here; replace them with your
# own and avoid committing real passwords.
uri = "bolt://localhost:7687"
user = "neo4j"
password = "swpu@swpu"

graph = Graph(uri, auth=(user, password))
# Clear existing data (optional). WARNING: this query deletes every node and
# every relationship attached to it in the connected database.
graph.run("MATCH (n) DETACH DELETE n")

步骤7: 将实体和关系存储到 Neo4j

定义一个函数来将实体和关系存储到 Neo4j 中:

# 将实体和关系存储到 Neo4j
def add_to_neo4j(entity1, relation, entity2):
    """Store one (person, relation, person) triple in Neo4j.

    Args:
        entity1: Name of the person at the start of the relationship.
        relation: Relationship type to use for the edge.
        entity2: Name of the person at the end of the relationship.

    Uses the module-level ``graph`` connection.
    """
    node1 = Node("Person", name=entity1)
    node2 = Node("Person", name=entity2)
    rel = Relationship(node1, relation, node2)

    # Merge the whole subgraph instead of creating the relationship: both
    # nodes are matched on the Person label and the name property, and the
    # relationship is reused when it already exists, so storing the same
    # triple twice does not produce duplicate edges.
    graph.merge(rel, "Person", "name")


# Write every extracted (person, relation, person) triple to Neo4j.
for entity1, relation, entity2 in relations:
    add_to_neo4j(entity1, relation, entity2)

完整代码

from transformers import BertTokenizer, BertForTokenClassification
import torch
from py2neo import Graph, Node, Relationship

# Load the tokenizer and the token-classification model used for NER.
# NOTE(review): the tokenizer is loaded from 'bert-base-chinese' while the
# weights come from 'ckiplab/bert-base-chinese-ner' — presumably both share
# the same vocabulary; confirm against the model card.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForTokenClassification.from_pretrained('ckiplab/bert-base-chinese-ner')

# Text preprocessing: the raw description of the family, encoded as a
# single-sentence batch of PyTorch tensors for the model.
text = "金庸的姐夫是钱学森,钱学森的妻子是蒋英,金庸的表哥是徐志摩,金庸的堂姐是琼瑶,金庸有一个哥哥,名叫查良钊。金庸有个儿子是查传侠"
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

# Entity recognition: run the token-classification model over the encoded
# text and keep the highest-scoring label id for every token.
with torch.no_grad():  # inference only, so no autograd graph is needed
    outputs = model(**inputs)
# Index the single batch element explicitly instead of squeeze(), which would
# also collapse any other size-1 dimension.
predictions = torch.argmax(outputs.logits, dim=2)[0]

# Decode the predictions into (token_list, label) groups.
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
entities = []
current_entity = []
current_label = None

# Walk over every token together with its predicted label.
for token, prediction in zip(tokens, predictions):
    label = model.config.id2label[prediction.item()]

    if label.startswith(("B-", "S-")):  # start of an entity (S- = single-token entity)
        if current_entity:  # flush the entity that was still open
            entities.append((current_entity, current_label))
        current_entity = [token]
        current_label = label[2:]  # drop the prefix, keep the entity type
        if label.startswith("S-"):
            # A single-token entity is complete immediately; storing it with
            # the bare type lets later "PERSON" comparisons match it.
            entities.append((current_entity, current_label))
            current_entity = []
            current_label = None
    elif label.startswith(("I-", "E-")):  # inside or end of an entity
        if not current_entity:
            # Continuation tag without a preceding B-: take the type from this
            # tag so the entity is not stored with a None label.
            current_label = label[2:]
        current_entity.append(token)
        if label.startswith("E-"):  # end tag closes the entity
            entities.append((current_entity, current_label))
            current_entity = []
            current_label = None
    else:
        if current_entity:  # a non-entity token closes any open entity
            entities.append((current_entity, current_label))
            current_entity = []
            current_label = None
        entities.append(([token], label))  # keep non-entity tokens as single-token groups

# Flush an entity that runs up to the end of the sequence.
if current_entity:
    entities.append((current_entity, current_label))

# Join the tokens of each group into a string and re-assemble relation words
# that were split across two adjacent groups (e.g. "父" + "亲").
relation_words = {"表哥", "姐夫", "妻子", "堂姐", "父亲", "母亲", "哥哥", "姐姐", "弟弟",
                  "妹妹", "儿子", "女儿"}
final_entities = []
i = 0
while i < len(entities):
    entity, label = entities[i]
    entity_str = ''.join(entity)

    if i + 1 < len(entities):
        merged = entity_str + ''.join(entities[i + 1][0])
        if merged in relation_words:
            entity_str = merged
            label = "REL"  # mark the merged relation word
            i += 1  # the following group has been consumed by the merge
    final_entities.append((entity_str, label))
    i += 1

print("final_entities:", final_entities)

# Relation patterns: map each Chinese kinship word to the relationship type
# that is stored in the graph.
relation_patterns = {
    "表哥": "brother_low",
    "姐夫": "sister_husband",
    "妻子": "wife",
    "堂姐":"cousin",
    "父亲": "father",
    "母亲": "mother",
    "哥哥": "brother",
    "姐姐": "sister",
    "弟弟": "brother",
    "妹妹": "sister",
    "儿子": "son",
    "女儿": "daughter"
}


def _nearest_person(entries, start, step):
    """Return the text of the closest PERSON entry, scanning from ``start``.

    ``step`` is -1 to scan backwards or +1 to scan forwards. Returns None when
    no PERSON entry exists in that direction.
    """
    idx = start
    while 0 <= idx < len(entries):
        text, tag = entries[idx]
        if tag == "PERSON":
            return text
        idx += step
    return None


# Relation extraction: for every merged relation word, link the nearest
# PERSON before it to the nearest PERSON after it.
relations = []
for i, (token, label) in enumerate(final_entities):
    if label != "REL" or token not in relation_patterns:
        continue
    relation = relation_patterns[token]

    # Both ends are looked up fresh for every relation word, so a missing
    # neighbour yields None instead of raising NameError or silently reusing
    # the person found for a previous relation.
    entity1 = _nearest_person(final_entities, i - 1, -1)
    entity2 = _nearest_person(final_entities, i + 1, 1)

    # Record the triple only when both persons were found.
    if entity1 and entity2:
        relations.append((entity1, relation, entity2))

print("Extracted relations:", relations)

# Connect to the Neo4j database over Bolt.
# NOTE(review): the credentials are hard-coded here; replace them with your
# own and avoid committing real passwords.
uri = "bolt://localhost:7687"
user = "neo4j"
password = "swpu@swpu"

graph = Graph(uri, auth=(user, password))
# Clear existing data (optional). WARNING: this query deletes every node and
# every relationship attached to it in the connected database.
graph.run("MATCH (n) DETACH DELETE n")

# 将实体和关系存储到 Neo4j
def add_to_neo4j(entity1, relation, entity2):
    """Store one (person, relation, person) triple in Neo4j.

    Args:
        entity1: Name of the person at the start of the relationship.
        relation: Relationship type to use for the edge.
        entity2: Name of the person at the end of the relationship.

    Uses the module-level ``graph`` connection.
    """
    node1 = Node("Person", name=entity1)
    node2 = Node("Person", name=entity2)
    rel = Relationship(node1, relation, node2)

    # Merge the whole subgraph instead of creating the relationship: both
    # nodes are matched on the Person label and the name property, and the
    # relationship is reused when it already exists, so storing the same
    # triple twice does not produce duplicate edges.
    graph.merge(rel, "Person", "name")


# Write every extracted (person, relation, person) triple to Neo4j.
for entity1, relation, entity2 in relations:
    add_to_neo4j(entity1, relation, entity2)


原文地址:https://blog.csdn.net/Metal1/article/details/142797705

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!