Article Outline
TOC
Collection Outline
<center><b><font color="#A52A2A" size="5">公众号:数据挖掘与机器学习笔记</font></b></center>
在之前的文章“论文阅读04:使用序列标注的方式解决实体和关系的联合抽取”中,介绍了一种使用序列标注的方式来进行实体和关系联合抽取的方法,模型的具体细节可以查看该文章。本文对该论文提出的模型进行简单的实现,其中论文提到的偏置目标函数通过给不同的标签赋予不同的权重来实现。
这里仅实现了模型并保证代码能够跑通。由于是在个人笔记本上运行,机器性能不够,模型没有训练完,也没有调参,因此最终的训练效果未知。感兴趣的同学可以自行调参。
1.数据处理(data_helper.py)
import re
import json
import numpy as np
from tensorflow.keras.preprocessing import sequence
from config import *
def get_data(train_path, test_path):
    """
    Load the training and test samples from two JSON-lines files.

    Every non-blank line is parsed with ``json.loads`` and expanded by
    ``data_decoding`` into one (tokens, tags) sample per relation mention.

    :param train_path: path of the training file, one JSON object per line
    :param test_path: path of the test file, same format
    :return: ``(x_train, y_train, x_test, y_test)``; each ``x_*`` is a list of
        token lists and each ``y_*`` is the parallel list of tag lists
    """
    x_train, y_train = _load_split(train_path)
    x_test, y_test = _load_split(test_path)
    return x_train, y_train, x_test, y_test


def _load_split(path):
    """Read one JSON-lines file and return its (token lists, tag lists)."""
    x_all = []
    y_all = []
    # The context manager closes the file even if a line fails to parse.
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            x_data, y_data = data_decoding(json.loads(line))
            x_all += x_data
            y_all += y_data
    return x_all, y_all
def data_decoding(data):
    """
    Expand one parsed JSON record into per-relation training samples.

    ``data["sentText"]`` is the sentence and ``data["relationMentions"]`` the
    list of relation mentions found in it. One sample is produced for every
    mention, so a sentence with several relations appears several times.

    :param data: dict with the keys ``sentText`` and ``relationMentions``
    :return: ``(x_data, y_data)`` where ``x_data[k]`` is the sentence split on
        single spaces and ``y_data[k]`` the tag list built for mention ``k``
    """
    text = data["sentText"]
    mentions = data["relationMentions"]

    token_samples = []
    tag_samples = []
    for mention in mentions:
        first_entity = mention["em1Text"].split(" ")
        second_entity = mention["em2Text"].split(" ")
        relation_name = mention["label"]

        first_labels = entity_label_construction(first_entity)
        second_labels = entity_label_construction(second_entity)
        tags = sentence_label_construction(text, first_labels, second_labels, relation_name)

        # A fresh token list per sample keeps the samples independent objects.
        token_samples.append(text.split(" "))
        tag_samples.append(tags)

    return token_samples, tag_samples
def entity_label_construction(entity):
    """
    Map every word of an entity to its BIES position label.

    A single-word entity is labelled "S"; otherwise the first word is "B",
    the last word is "E" and every word in between is "I". The result is
    keyed by the word itself, so a word that occurs twice in the entity keeps
    the label of its last occurrence.

    :param entity: list of the entity's words
    :return: dict ``{word: label}``
    """
    last_pos = len(entity) - 1
    labels = {}
    for pos, word in enumerate(entity):
        if last_pos == 0:
            label = "S"
        elif pos == 0:
            label = "B"
        elif pos == last_pos:
            label = "E"
        else:
            label = "I"
        labels[word] = label
    return labels
def sentence_label_construction(sentence, relation_label_1, relation_label_2, relation):
    """
    Build the tag sequence of a sentence for one relation triple.

    Each token that belongs to an entity is tagged
    ``<relation>-<BIES label>-<position>``, where position is "1" for the
    first entity of the triple and "2" for the second; every other token is
    tagged "O". Tokens are matched by their text, so any occurrence of an
    entity word in the sentence is tagged. A word present in both entities is
    tagged as part of the first one.

    :param sentence: the sentence as one string; it is split on single spaces
    :param relation_label_1: ``{word: BIES label}`` of the first entity
    :param relation_label_2: ``{word: BIES label}`` of the second entity
    :param relation: the relation name used as tag prefix
    :return: list of tags, one per token of the sentence
    """
    # NOTE(review): the second entity previously received the "-1" suffix as
    # well, which made the two entities indistinguishable. The tag dictionary
    # must contain the "-2" tags; confirm against the tag file in use.
    output_list = []
    for token in sentence.split(" "):
        if token in relation_label_1:
            output_list.append(relation + "-" + relation_label_1[token] + "-1")
        elif token in relation_label_2:
            output_list.append(relation + "-" + relation_label_2[token] + "-2")
        else:
            output_list.append("O")
    return output_list
def format_control(string):
    """
    Strip carriage returns, newlines and all remaining whitespace.

    :param string: the text to clean
    :return: ``string`` with every whitespace character removed
    """
    cleaned = string
    # Applied in this order: carriage returns, newlines, any other whitespace.
    for pattern in (r"\r", r"\n", r"\s*"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned
def get_index(word_dict, tag_dict, x_data, y_data):
    """
    Convert one sample's tokens and tags to integer ids.

    :param word_dict: ``{word: id}``; a token missing from it raises KeyError
    :param tag_dict: ``{tag: id}``; a tag missing from it falls back to the
        id of "O"
    :param x_data: the tokens of one sentence
    :param y_data: the tags of the same sentence
    :return: two-element list ``[token ids, tag ids]``
    """
    token_ids = []
    for token in x_data:
        token_ids.append(word_dict[str(token)])

    tag_ids = []
    for tag in y_data:
        tag_ids.append(tag_dict.get(str(tag), tag_dict["O"]))

    return [token_ids, tag_ids]
def word_tag_dict(word_dict_path, tag_dict_path):
    """
    Build the word and tag vocabularies from two one-entry-per-line files.

    Words are numbered from 1 in file order and tags from 0 in file order.
    If an entry is repeated, its last line number wins.

    :param word_dict_path: path of the word list, one word per line
    :param tag_dict_path: path of the tag list, one tag per line
    :return: ``(word_dict, tag_dict)`` mapping each entry to its integer id
    """
    # Context managers close both files; UTF-8 is requested explicitly so the
    # result does not depend on the platform's default encoding.
    with open(word_dict_path, "r", encoding="utf-8") as word_file:
        word_dict = {
            line.replace("\n", ""): index
            for index, line in enumerate(word_file, start=1)
        }
    with open(tag_dict_path, "r", encoding="utf-8") as tag_file:
        tag_dict = {
            line.replace("\n", ""): index
            for index, line in enumerate(tag_file)
        }
    return word_dict, tag_dict
class DataGenerator:
    """
    Endless batch generator over tokenised sentences and their tag sequences.

    Each batch is converted to ids with ``get_index`` and padded or truncated
    to ``max_len`` with ``sequence.pad_sequences``. Tags are padded with the
    id of "O"; tokens are padded with ``pad_sequences``' default value.
    """

    def __init__(self, word_dict, tag_dict, x_train, y_train, batch_size, max_len, is_test=False):
        """
        :param word_dict: ``{word: id}`` used to index the tokens
        :param tag_dict: ``{tag: id}``; must contain "O"
        :param x_train: list of token lists
        :param y_train: list of tag lists, parallel to ``x_train``
        :param batch_size: number of samples per batch
        :param max_len: length every sequence is padded/truncated to
        :param is_test: when True the samples are not shuffled
        """
        self.max_len = max_len
        self.word_dict = word_dict
        self.tag_dict = tag_dict
        self.x_train = x_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.is_test = is_test
        # Number of batches per pass; a trailing partial batch counts as one.
        full_batches, remainder = divmod(len(self.x_train), self.batch_size)
        self.steps = full_batches + (1 if remainder != 0 else 0)

    def __len__(self):
        """Return the number of batches in one pass over the data."""
        return self.steps

    def __iter__(self):
        """
        Yield ``(x_batch, y_batch)`` numpy arrays, cycling over the data
        forever. Samples are reshuffled at the start of every pass unless
        ``is_test`` is set. With no samples the iteration ends immediately.
        """
        sample_count = len(self.x_train)
        if sample_count == 0:
            # Without this guard the endless loop below would spin forever
            # without ever yielding a batch.
            return
        while True:
            idxs = list(range(sample_count))
            if not self.is_test:
                np.random.shuffle(idxs)
            last_position = sample_count - 1
            x_data, y_data = [], []
            for position, i in enumerate(idxs):
                x_out, y_out = get_index(self.word_dict, self.tag_dict, self.x_train[i], self.y_train[i])
                x_data.append(x_out)
                y_data.append(y_out)
                # Emit when the batch is full or the pass is finished, so the
                # final batch of a pass may be smaller than batch_size.
                if len(x_data) == self.batch_size or position == last_position:
                    x_data = sequence.pad_sequences(x_data, maxlen=self.max_len, padding="post", truncating="post")
                    y_data = sequence.pad_sequences(y_data, maxlen=self.max_len, padding="post", truncating="post", value=self.tag_dict["O"])
                    yield np.array(x_data), np.array(y_data)
                    x_data, y_data = [], []
if __name__ == "__main__":
    # Smoke test: build the generator on the training split and print the
    # shapes and first rows of a single batch.
    train_sentences, train_tags, test_sentences, test_tags = get_data(TRAIN_PATH, TEST_PATH)
    # Pad to the longest training sentence.
    longest = max(len(tokens) for tokens in train_sentences)
    words, tags = word_tag_dict(WORD_DICT, TAG_DICT)
    generator = DataGenerator(words, tags, train_sentences, train_tags, 16, longest)
    for batch_x, batch_y in iter(generator):
        print(batch_x.shape, batch_y.shape)
        print(batch_x[0])
        print(batch_y[0])
        # The generator is endless; one batch is enough here.
        break
2.模型实现(model.py)
from tensorflow import keras
from tensorflow.keras import layers
from config import MAX_LEN, BATCH_SIZE, LSTM_DECODE, LSTM_ENCODE, WORD_DICT, TAG_DICT, EMBEDDING_SIZE
def bilstm_lstm(word_size, tag_size):
    """
    Build and compile the BiLSTM-encoder / LSTM-decoder tagging model.

    The network is: embedding -> bidirectional LSTM -> LSTM -> per-step
    softmax over the tags. It is compiled with Adam, sparse categorical
    cross-entropy and the accuracy metric, and its summary is printed.

    :param word_size: size of the embedding vocabulary (``input_dim``)
    :param tag_size: number of tag classes of the softmax output
    :return: the compiled ``keras.models.Model``
    """
    # Keras documents ``shape`` as a tuple that excludes the batch dimension,
    # so the sequence length is wrapped rather than passed as a bare int.
    x = layers.Input(shape=(MAX_LEN,), batch_size=BATCH_SIZE)
    embedding_x = layers.Embedding(input_dim=word_size, output_dim=EMBEDDING_SIZE)(x)
    # return_sequences=True keeps one output per time step for tagging.
    bilstm_encode = layers.Bidirectional(layers.LSTM(units=LSTM_ENCODE, return_sequences=True))(embedding_x)
    bilstm_decode = layers.LSTM(units=LSTM_DECODE, return_sequences=True)(bilstm_encode)
    out = layers.Dense(units=tag_size, activation="softmax")(bilstm_decode)
    model = keras.models.Model(x, out)
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"],
    )
    model.summary()
    return model
3.模型训练
from data_helper import get_data, word_tag_dict, DataGenerator
from config import MAX_LEN, TRAIN_PATH, TEST_PATH, WORD_DICT, TAG_DICT, EPOCH_NUM, BATCH_SIZE, model_save_path
from model import bilstm_lstm
def get_class_weight(tag_dict):
    """
    Build the per-class weights used to bias the loss towards entity tags.

    The tag "O" gets weight 1 and every other tag gets weight 10.

    :param tag_dict: ``{tag: class index}``
    :return: ``{class index: weight}``
    """
    return {index: (1 if tag == "O" else 10) for tag, index in tag_dict.items()}
def train_bilstm_lstm():
    """
    Train the BiLSTM-LSTM tagger and save its weights.

    Loads the data and vocabularies from the paths in ``config``, trains for
    ``EPOCH_NUM`` epochs with the class weights from ``get_class_weight``,
    validates on the test split and writes the weights to
    ``model_save_path``.
    """
    x_train, y_train, x_test, y_test = get_data(TRAIN_PATH, TEST_PATH)
    word_dict, tag_dict = word_tag_dict(WORD_DICT, TAG_DICT)
    train_generator = DataGenerator(word_dict, tag_dict, x_train, y_train, BATCH_SIZE, MAX_LEN)
    # Evaluation data does not need shuffling.
    test_generator = DataGenerator(word_dict, tag_dict, x_test, y_test, BATCH_SIZE, MAX_LEN, is_test=True)
    class_weight = get_class_weight(tag_dict)
    print(class_weight)
    # Word ids start at 1, so the embedding needs one extra row for id 0.
    model = bilstm_lstm(len(word_dict) + 1, len(tag_dict))
    # Model.fit accepts generators directly; fit_generator is deprecated.
    # Both splits are passed as generator iterators, since the DataGenerator
    # object itself is neither a generator nor a keras Sequence.
    # NOTE(review): Keras documents class_weight for one label per sample;
    # confirm it weights every time step of these sequence labels as intended.
    model.fit(
        iter(train_generator),
        epochs=EPOCH_NUM,
        steps_per_epoch=train_generator.steps,
        validation_data=iter(test_generator),
        validation_steps=test_generator.steps,
        class_weight=class_weight,
    )
    model.save_weights(filepath=model_save_path)


if __name__ == "__main__":
    train_bilstm_lstm()
所用训练数据较大,未上传至GitHub,如有需要请私信。
代码:https://github.com/chongzicbo/KG_Tutorial/tree/main/relation_extract/joint_re_bilstm_ntc