Based on 当Bert遇上Keras (https://spaces.ac.cn/archives/6736), whose example reaches 95.5%+ accuracy.

keras-bert (Chinese README): https://github.com/CyberZHG/keras-bert/blob/master/README.zh-CN.md

Example implementation

# ! -*- coding:utf-8 -*-

import json
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import codecs

maxlen = 100
config_path = 'model/bert_config.json'
checkpoint_path = 'model/bert_model.ckpt'
dict_path = 'model/vocab.txt'

# build the token dictionary from the BERT vocabulary
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)


class OurTokenizer(Tokenizer):
    def __init__(self, token_dict):
        super(OurTokenizer, self).__init__(token_dict)

    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # represent whitespace with the untrained [unused1] token
            else:
                R.append('[UNK]')  # map every remaining character to [UNK]
        return R


tokenizer = OurTokenizer(token_dict)

# load the labelled reviews: 0 = negative, 1 = positive
neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)

data = []
for d in neg[0]:
    data.append((d, 0))
for d in pos[0]:
    data.append((d, 1))

# split into training and validation sets at a 9:1 ratio
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]


def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])


class data_generator:
    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                x1, x2 = tokenizer.encode(first=text)
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []


from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam

bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
for l in bert_model.layers:
    l.trainable = False

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])
x = Lambda(lambda x: x[:, 0])(x)  # take the [CLS] vector
p = Dense(1, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['accuracy']
)
model.summary()

train_D = data_generator(train_data)
valid_D = data_generator(valid_data)
test = [train_data[0]]
test_D = data_generator(test)  # single-example generator (not used below)

model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=1,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D)
)

# save the full model (architecture + weights)
model.save('model.h5')
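For intuition, a small sketch of what the custom tokenizer produces. This is illustrative only; the actual ids depend on vocab.txt:

x1, x2 = tokenizer.encode(first='很好')
print(x1)  # [CLS id, id('很'), id('好'), SEP id]: four ids for two characters
print(x2)  # [0, 0, 0, 0]: segment ids are all zero for a single-sentence input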

Problems with the original example

After the model is saved, loading it again fails with errors about unknown custom layers and a custom activation function. I could not find a fix at first; the working solution is shown below.
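The cause is that keras-bert builds the model from its own layer classes and the gelu activation, which a plain load_model call cannot deserialize by name. A minimal sketch of the fix, assuming the model was saved with model.save('save_path.h5') as in the script below:

from keras.models import load_model
from keras_bert import get_custom_objects

# get_custom_objects() returns a dict mapping keras-bert's custom layer and
# activation names to their implementations, so Keras can rebuild the graph
new_model = load_model('save_path.h5', custom_objects=get_custom_objects())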

Solution

# ! -*- coding:utf-8 -*-

import json
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_custom_objects
import re, os
import codecs
from keras.models import load_model

maxlen = 100
config_path = 'model/bert_config.json'
checkpoint_path = 'model/bert_model.ckpt'
dict_path = 'model/vocab.txt'

# build the token dictionary from the BERT vocabulary
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)


class OurTokenizer(Tokenizer):
    def __init__(self, token_dict):
        super(OurTokenizer, self).__init__(token_dict)

    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # represent whitespace with the untrained [unused1] token
            else:
                R.append('[UNK]')  # map every remaining character to [UNK]
        return R


tokenizer = OurTokenizer(token_dict)

neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)

data = []
for d in neg[0]:
    data.append((d, 0))
for d in pos[0]:
    data.append((d, 1))

# split into training and validation sets at a 9:1 ratio
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]


def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])


class data_generator:
    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                x1, x2 = tokenizer.encode(first=text)
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
for l in bert_model.layers:
    l.trainable = False

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])
print(bert_model.layers)
x = Lambda(lambda x: x[:, 0])(x)  # take the [CLS] vector
p = Dense(1, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['accuracy']
)
model.summary()

train_D = data_generator(train_data)
valid_D = data_generator(valid_data)

# training already done in an earlier run; commented out so this script only tests loading
'''
model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=5,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D)
)
model.save('save_path.h5')
'''


# helper that tokenizes and pads a slice of the dataset in one call
class data_token_generator:
    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data)  # self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def get_data(self):
        idxs = list(range(len(self.data)))
        np.random.shuffle(idxs)
        X1, X2, Y = [], [], []
        for i in idxs:
            d = self.data[i]
            text = d[0][:maxlen]
            print(text)
            x1, x2 = tokenizer.encode(first=text)
            y = d[1]
            X1.append(x1)
            X2.append(x2)
            Y.append([y])
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        Y = seq_padding(Y)
        return X1, X2, Y


# the fix: pass keras-bert's custom objects so its layers and activations deserialize
new_model = load_model('save_path.h5', custom_objects=get_custom_objects())
test_T = data_token_generator(valid_data[0:10])
X_test1, X_test2, Y_test = test_T.get_data()
print(Y_test)
print(new_model.predict([X_test1, X_test2]))

My implementation

# ! -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer, get_checkpoint_paths
import codecs
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam

# maximum length of a review text
maxlen = 100
dict_path = 'model/vocab.txt'
token_dict = {}
EPOCHS = 30
BATCH_SIZE = 128

# build the token dictionary
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        # print(token, len(token_dict))
        token_dict[token] = len(token_dict)


# define the tokenizer
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # represent whitespace with the untrained [unused1] token
            else:
                R.append('[UNK]')  # map every remaining character to [UNK]
        return R


# instantiate the tokenizer
tokenizer = OurTokenizer(token_dict)

# load the dataset
neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)

data = []
for d in neg[0]:
    data.append((d, 0))
for d in pos[0]:
    data.append((d, 1))

# split into training and validation sets at a 9:1 ratio
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]


# pad token sequences to the same length
def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    t = [
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ]
    return t


# generator that converts the dataset into BERT feature vectors
class data_token_generator:
    def __init__(self, data, batch_size=32, print_text=False):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data)  # self.batch_size
        self.print_text = print_text
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
        # locate the Chinese BERT checkpoint under model/
        paths = get_checkpoint_paths('model')
        # load the Chinese BERT model (each instance loads its own copy)
        self.bert_model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint, seq_len=None)
        for l in self.bert_model.layers:
            l.trainable = True

    def __len__(self):
        return self.steps

    def get_data(self):
        data_x = []
        data_y = []
        idxs = list(range(len(self.data)))
        # shuffle
        np.random.shuffle(idxs)
        indices, segments, Y = [], [], []
        for i in idxs:
            d = self.data[i]
            # truncate the text
            text = d[0][:maxlen]
            if self.print_text:
                print(text)
            # token indices and segment ids
            indice, segment = tokenizer.encode(first=text)
            y = d[1]
            # collect into the current batch
            indices.append(indice)
            segments.append(segment)
            Y.append([y])
            # flush a full batch (or the final partial one)
            if len(indices) == self.batch_size or i == idxs[-1]:
                indices = seq_padding(indices)
                segments = seq_padding(segments)
                Y = seq_padding(Y)
                # run BERT to produce the embeddings
                x = self.bert_model.predict([np.array(indices), np.array(segments)])
                j_idxs = list(range(len(x)))
                for j in j_idxs:
                    data_x.append(x[j])
                    data_y.append(Y[j])
                print(len(data_y))
                indices, segments, Y = [], [], []
        # note: this assumes all batches were padded to the same length;
        # otherwise pad to a fixed maxlen before stacking
        return np.array(data_x), np.array(data_y)


# define the binary classification head
x_in = Input(shape=(None, 768))
x = Lambda(lambda x: x[:, 0])(x_in)  # take the [CLS] vector
p = Dense(1, activation='sigmoid')(x)

model = Model(x_in, p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['accuracy']
)
# print the model structure
model.summary()

# start training
print('Training -----------')
train_T = data_token_generator(train_data)
train_x, train_y = train_T.get_data()
valid_T = data_token_generator(valid_data)
validation_data = valid_T.get_data()
model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=validation_data
)
model.save('new_model.h5')

# reload the model and verify
import keras

test_T = data_token_generator(valid_data[0:10], print_text=True)
X_test, Y_test = test_T.get_data()
print(Y_test)
new_model = keras.models.load_model('new_model.h5')
y = new_model.predict(X_test)
print(y)
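For single-sentence inference with this feature-based setup, the text has to be embedded by BERT first and the embedding then scored by the saved head. A hedged sketch, reusing the names from the script above; predict_sentiment is a hypothetical helper, not part of the original code:

# hypothetical helper (not in the original script): embed one sentence with
# BERT, then score it with the trained classifier head
def predict_sentiment(text):
    indices, segments = tokenizer.encode(first=text[:maxlen])
    emb = test_T.bert_model.predict([np.array([indices]), np.array([segments])])
    return new_model.predict(emb)[0][0]  # probability of the positive class

print(predict_sentiment('这个东西很好用'))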

With the Harbin Institute of Technology (HIT) Chinese weights, accuracy is around 80%.

Dependencies

Chinese pre-trained weights
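The scripts above expect the checkpoint files unpacked into a model/ directory. A quick sanity check, assuming a standard Chinese checkpoint layout (for example Google's chinese_L-12_H-768_A-12, or the HIT variant mentioned above):

from keras_bert import get_checkpoint_paths

# get_checkpoint_paths resolves the three files every script above reads
paths = get_checkpoint_paths('model')
print(paths.config)      # model/bert_config.json
print(paths.checkpoint)  # model/bert_model.ckpt
print(paths.vocab)       # model/vocab.txt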
