DuEE 信息抽取:篇章级数据预处理
import os
import sys
import json
def read_by_lines(path):
    """Read a UTF-8 text file and return its lines, whitespace-stripped.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per line of the file. Each line is passed
        through ``str.strip()``, so the line terminator and any leading or
        trailing whitespace are removed. Blank lines are kept as ``""``.
    """
    with open(path, "r", encoding="utf8") as infile:
        return [line.strip() for line in infile]
def write_by_lines(path, data):
    """Write strings to a UTF-8 text file, one per line.

    The file at ``path`` is opened in ``"w"`` mode, so existing content is
    replaced.

    Args:
        path: Path of the file to write.
        data: Iterable of strings; a ``"\\n"`` is appended to each one.
    """
    with open(path, "w", encoding="utf8") as outfile:
        # writelines consumes the generator directly; no throwaway list of
        # write() return values is built.
        outfile.writelines(d + "\n" for d in data)
def text_to_sents(text):
    """Split text into sentences at sentence-ending punctuation.

    The text is first split on ``"\\n"``. Paragraphs that are empty or
    whitespace-only are skipped. Each remaining paragraph is cut directly
    after every delimiter character, so a delimiter stays attached to the
    sentence it ends. Text after the last delimiter of a paragraph is
    returned as a final sentence. Sentences are not stripped.

    Args:
        text: The input string.

    Returns:
        A flat list of sentence strings, in document order.
    """
    # Sentence-ending characters that trigger a split.
    delimiter_symbols = [u"。", u"?", u"!"]
    ret = []
    for para in text.split("\n"):
        if not para.strip():
            continue
        start = 0  # index where the sentence currently being read begins
        for idx, ch in enumerate(para):
            if ch in delimiter_symbols:
                # Slice once per sentence instead of growing a string one
                # character at a time.
                ret.append(para[start:idx + 1])
                start = idx + 1
        if start < len(para):
            # Trailing text that is not closed by a delimiter.
            ret.append(para[start:])
    return ret
# Manual check of text_to_sents on a sample with one delimiter and a tail.
a = text_to_sents("这些分隔符用于在中文文本中识别和分割句子。mmm")
import hashlib
def calculate_md5(input_str):
    """Return the MD5 digest of a string as a hexadecimal string.

    Args:
        input_str: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The value of ``hashlib.md5(...).hexdigest()`` for the encoded text.
    """
    return hashlib.md5(input_str.encode("utf-8")).hexdigest()
# Manual check: print the MD5 hex digest of a fixed sample string.
input_string = "hello world"
print(calculate_md5(input_string))
# Output directory for the generated tag dictionaries.
conf_dir = "./conf/DuEE-Fin"
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(conf_dir, exist_ok=True)
schema_path = "./datasets/DuEE-fin/duee_fin_event_schema.json"
tags_trigger_path = "{}/trigger_tag.dict".format(conf_dir)  # trigger tags
tags_role_path = "{}/role_tag.dict".format(conf_dir)  # role tags
tags_enum_path = "{}/enum_tag.dict".format(conf_dir)  # enum values
def label_add(labels, _type):
    """Add the ``B-``/``I-`` tag pair for ``_type`` to ``labels`` if missing.

    Args:
        labels: List of tag strings; it is extended in place.
        _type: Name used to build the ``"B-<_type>"`` and ``"I-<_type>"``
            tags.

    Returns:
        The same ``labels`` list. It is unchanged when ``"B-<_type>"`` was
        already present.
    """
    begin_tag = "B-{}".format(_type)
    if begin_tag not in labels:
        labels.extend([begin_tag, "I-{}".format(_type)])
    return labels
# Trigger tags: a B-/I- pair for every event_type in the schema, then "O".
schema_l = read_by_lines(schema_path)  # each line is parsed as JSON below
labels = []
for line in schema_l:
    d_json = json.loads(line.strip())
    labels = label_add(labels, d_json["event_type"])
# "O" is appended once, after all event types have been collected.
labels.append("O")
# One "<index>\t<label>" row per tag, ready to be written line by line.
tags_trigger = [
    "{}\t{}".format(index, label) for index, label in enumerate(labels)
]
write_by_lines(tags_trigger_path, tags_trigger)
# Role tags: a B-/I- pair for every role in the schema except the enum
# role, then "O".
enum_role = "环节"
labels = []
for line in schema_l:
    d_json = json.loads(line.strip())
    for role in d_json["role_list"]:
        if role["role"] == enum_role:
            # The enum role gets no B-/I- tags here.
            continue
        labels = label_add(labels, role["role"])
# "O" is appended once, after all roles have been collected.
labels.append("O")
# One "<index>\t<label>" row per tag.
tags_roles = [
    "{}\t{}".format(index, label) for index, label in enumerate(labels)
]
write_by_lines(tags_role_path, tags_roles)
# Enum tags: the enum_items listed for the enum role in the schema.
enum_role = "环节"
labels = []
for line in schema_l:
    d_json = json.loads(line.strip())
    for role in d_json["role_list"]:
        if role["role"] == enum_role:
            # If several schema entries carry the enum role, the last one
            # read determines the final list.
            labels = role["enum_items"]
# One "<index>\t<label>" row per enum item.
tags_enums = [
    "{}\t{}".format(index, label) for index, label in enumerate(labels)
]
write_by_lines(tags_enum_path, tags_enums)
# data process
data_dir = "./datasets/DuEE-Fin"
sentence_dir = "{}/sentence".format(data_dir)
trigger_save_dir = "{}/trigger".format(data_dir)
role_save_dir = "{}/role".format(data_dir)
enum_save_dir = "{}/enum".format(data_dir)
# exist_ok=True creates the directory only when it is missing.
os.makedirs(sentence_dir, exist_ok=True)
# NOTE(review): this path uses "DuEE-fin" while data_dir uses "DuEE-Fin";
# on a case-sensitive filesystem these are different directories - confirm
# which spelling is intended.
x_train = read_by_lines("./datasets/DuEE-fin/duee_fin_train.json")
def argument_in_sent(sent, argument_list, trigger):
trigger_start = sent.find(
原文地址:https://blog.csdn.net/LIjin_1006/article/details/140221890
免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!