先运行main.py进行文本序列化,再train.py模型训练

dataset.py

from torch.utils.data import DataLoader,Dataset
import torch
import os
from utils import tokenlize
import config class ImdbDataset(Dataset):
def __init__(self,train=True):
super(ImdbDataset,self).__init__()
data_path = r"H:\073-nlp自然语言处理-v5.bt38[周大伟]\073-nlp自然语言处理-v5.bt38[周大伟]\第四天\代码\data\aclImdb_v1\aclImdb"
data_path += r"\train" if train else r"\test"
self.total_path = []
for temp_path in [r"\pos",r"\neg"]:
cur_path = data_path + temp_path
self.total_path += [os.path.join(cur_path,i) for i in os.listdir(cur_path) if i.endswith(".txt")] def __getitem__(self, idx):
file = self.total_path[idx]
review = open(file,encoding="utf-8").read()
review = tokenlize(review)
label = int(file.split("_")[-1].split(".")[0])
label = 0 if label < 5 else 1
return review,label def __len__(self):
return len(self.total_path) def collate_fn(batch):
'''
对batch数据进行处理
:param batch:
:return:
'''
reviews,labels = zip(*batch)
reviews = torch.LongTensor([config.ws.transform(i,max_len=config.max_len) for i in reviews])
labels = torch.LongTensor(labels)
return reviews,labels def get_dataloader(train):
imdbdataset = ImdbDataset(train=True)
batch_size = config.train_batch_size if train else config.test_batch_size
return DataLoader(imdbdataset,batch_size=batch_size,shuffle=True,collate_fn=collate_fn) if __name__ == '__main__':
# dataset = ImdbDataset(train=True)
# print(dataset[1])
for idx,(review,label) in enumerate(get_dataloader(train=True)):
print(review)
print(label)
break

  utils.py

"""
实现额外的方法
"""
import re def tokenlize(sentence):
"""
进行文本分词
:param sentence: str
:return: [str,str,str]
""" fileters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>',
'\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”', '“', ]
sentence = sentence.lower() #把大写转化为小写
sentence = re.sub("<br />"," ",sentence)
# sentence = re.sub("I'm","I am",sentence)
# sentence = re.sub("isn't","is not",sentence)
sentence = re.sub("|".join(fileters)," ",sentence)
result = [i for i in sentence.split(" ") if len(i)>0] return result

word_sequence.py

'''
文本序列化
''' class WordSequence():
UNK_TAG = "<UNK>"
PAD_TAG = "<PAD>"
UNK = 1
PAD = 0 def __init__(self):
self.dict = {
self.UNK_TAG:self.UNK,
self.PAD_TAG:self.PAD
}
self.count = {} def fit(self,sentence):
'''
统计词频
:param sentence:
:return:
'''
for word in sentence:
self.count[word] = self.count.get(word,0)+1 def build_vocab(self,min_count=0,max_count = None,max_features = None):
"""
根据条件构建 词典
:param min_count:最小词频
:param max_count: 最大词频
:param max_features: 最大词语数
:return:
"""
if min_count is not None:
self.count = {word:count for word,count in self.count.items() if count >min_count}
if max_count is not None:
self.count = {word:count for word,count in self.count.items() if count<max_count}
if max_features is not None:
#排序
self.count = dict(sorted(self.count.items(),lambda x:x[-1],reverse=True)[:max_features]) for word in self.count:
self.dict[word] = len(self.dict) #每次word对应一个数字 #把dict进行翻转
self.inverse_dict = dict(zip(self.dict.values(),self.dict.keys())) def transform(self,sentence,max_len =None):
'''
把句子转化为数字序列
:param sentence:
:return:
'''
if len(sentence) > max_len:
sentence = sentence[:max_len]
else:
sentence = sentence + [self.PAD_TAG]*(max_len-len(sentence))
return [self.dict.get(i,1) for i in sentence] def inverse_transform(self,incides):
"""
把数字序列转化为字符
:param incides:
:return:
"""
return [self.inverse_dict.get(i,"<UNK>") for i in incides] def __len__(self):
return len(self.dict) if __name__ == '__main__':
sentences = [["今天","天气","很","好"],
["今天","去","吃","什么"]] ws = WordSequence()
for sentence in sentences:
ws.fit(sentence) ws.build_vocab(min_count=0)
print(ws.dict)
ret = ws.transform(["好","热","呀","呀","呀","呀","呀","呀","呀"],max_len=5)
print(ret)
ret = ws.inverse_transform(ret)
print(ret)

  main.py

from word_sequence import WordSequence
from dataset import get_dataloader
import pickle
from tqdm import tqdm if __name__ == '__main__':
ws = WordSequence()
train_data = get_dataloader(True)
test_data = get_dataloader(False)
for reviews,labels in tqdm(train_data,total=len(train_data)):
for review in reviews:
ws.fit(review)
for reviews,labels in tqdm(test_data,total=len(test_data)):
for review in reviews:
ws.fit(review)
print("正在建立...")
ws.build_vocab()
print(len(ws))
pickle.dump(ws,open("./models/ws.pkl","wb"))

  model.py

"""
构建模型
"""
import torch.nn as nn
import config
import torch.nn.functional as F class ImdbModel(nn.Module):
def __init__(self):
super(ImdbModel,self).__init__()
self.embedding = nn.Embedding(num_embeddings=len(config.ws),embedding_dim=300,padding_idx=config.ws.PAD)
self.fc = nn.Linear(config.max_len*300,2) def forward(self,input):
'''
:param input:
:return:
'''
input_embeded = self.embedding(input) input_embeded_viewed = input_embeded.view(input_embeded.size(0),-1) out = self.fc(input_embeded_viewed)
return F.log_softmax(out,dim=-1)

  LSTMmodel.py

"""
构建模型
"""
import torch.nn as nn
import torch
import config
import torch.nn.functional as F class ImdbModel(nn.Module):
def __init__(self):
super(ImdbModel,self).__init__()
self.embedding = nn.Embedding(num_embeddings=len(config.ws),embedding_dim=300,padding_idx=config.ws.PAD)
self.lstm = nn.LSTM(input_size=200,hidden_size=64,num_layers=2,batch_first=True,bidirectional=True,dropout=0.5)
self.fc1 = nn.Linear(64*2,64)
self.fc2 = nn.Linear(64,2) def forward(self,input):
'''
:param input:
:return:
'''
input_embeded = self.embedding(input) #[batch_size,seq_len,200] output,(h_n,c_n) = self.lstm(input_embeded)
out = torch.cat(h_n[-1,:,:],h_n[-2,:,:],dim=-1) #拼接正向最后一个输出和反向最后一个输出 #进行全连接
out_fc1 = self.fc1(out)
#进行relu
out_fc1_relu = F.relu(out_fc1)
#全连接
out = self.fc2(out_fc1_relu)
return F.log_softmax(out,dim=-1)

  train.py

'''
进行模型的训练
'''
import torch import config
from model import ImdbModel
from dataset import get_dataloader
from torch.optim import Adam
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from eval import eval model = ImdbModel().to(config.device)
optimizer = Adam(model.parameters(),lr=0.001)
loss_list = [] def train(epoch):
train_dataloader = get_dataloader(train=True)
bar = tqdm(train_dataloader,total=len(train_dataloader)) for idx,(input,target) in enumerate(bar):
optimizer.zero_grad()
input = input.to(config.device)
target = target.to(config.device)
output = model(input)
loss = F.nll_loss(output,target)
loss.backward()
loss_list.append(loss.item())
optimizer.step()
bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch,idx,np.mean(loss_list))) if idx%10 == 0:
torch.save(model.state_dict(),"./models/model.pkl")
torch.save(optimizer.state_dict(),"./models/optimizer.pkl") if __name__ == '__main__':
for i in range(5):
train(i)
eval()
plt.figure(figsize=(20,8))
plt.plot(range(len(loss_list)),loss_list)

  eval.py

'''
进行模型的训练
'''
import torch import config
from model import ImdbModel
from dataset import get_dataloader
from torch.optim import Adam
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt def eval():
model = ImdbModel().to(config.device)
model.load_state_dict(torch.load("./models/model.pkl"))
model.eval()
loss_list = []
acc_list = []
test_dataloader = get_dataloader(train=False)
with torch.no_grad():
for input,target in test_dataloader:
input = input.to(config.device)
target = target.to(config.device)
output = model(input)
loss = F.nll_loss(output,target)
loss_list.append(loss.item())
#准确率
pred= output.max(dim = -1)[-1]
acc_list.append(pred.eq(target).cpu().float().mean())
print("loss:{:.6f},acc:{}".format(np.mean(loss_list),np.mean(acc_list))) if __name__ == '__main__':
eval()

  

pytorch LSTM情感分类全部代码的更多相关文章

  1. pytorch 文本情感分类和命名实体识别NER中LSTM输出的区别

    文本情感分类: 文本情感分类采用LSTM的最后一层输出 比如双层的LSTM,使用正向的最后一层和反向的最后一层进行拼接 def forward(self,input): ''' :param inpu ...

  2. NLP(十九) 双向LSTM情感分类模型

    使用IMDB情绪数据来比较CNN和RNN两种方法,预处理与上节相同 from __future__ import print_function import numpy as np import pa ...

  3. PaddlePaddle︱开发文档中学习情感分类(CNN、LSTM、双向LSTM)、语义角色标注

    PaddlePaddle出教程啦,教程一部分写的很详细,值得学习. 一期涉及新手入门.识别数字.图像分类.词向量.情感分析.语义角色标注.机器翻译.个性化推荐. 二期会有更多的图像内容. 随便,帮国产 ...

  4. 使用BERT进行情感分类预测及代码实例

    文章目录 0. BERT介绍 1. BERT配置 1.1. clone BERT 代码 1.2. 数据处理 1.2.1预训练模型 1.2.2数据集 训练集 测试集 开发集 2. 修改代码 2.1 加入 ...

  5. 基于Bert的文本情感分类

    详细代码已上传到github: click me Abstract:    Sentiment classification is the process of analyzing and reaso ...

  6. kaggle——Bag of Words Meets Bags of Popcorn(IMDB电影评论情感分类实践)

    kaggle链接:https://www.kaggle.com/c/word2vec-nlp-tutorial/overview 简介:给出 50,000 IMDB movie reviews,进行0 ...

  7. 文本情感分类:分词 OR 不分词(3)

    为什么要用深度学习模型?除了它更高精度等原因之外,还有一个重要原因,那就是它是目前唯一的能够实现“端到端”的模型.所谓“端到端”,就是能够直接将原始数据和标签输入,然后让模型自己完成一切过程——包括特 ...

  8. 使用bert进行情感分类

    2018年google推出了bert模型,这个模型的性能要远超于以前所使用的模型,总的来说就是很牛.但是训练bert模型是异常昂贵的,对于一般人来说并不需要自己单独训练bert,只需要加载预训练模型, ...

  9. NLP文本情感分类传统模型+深度学习(demo)

    文本情感分类: 文本情感分类(一):传统模型 摘自:http://spaces.ac.cn/index.php/archives/3360/ 测试句子:工信处女干事每月经过下属科室都要亲口交代24口交 ...

随机推荐

  1. Centos 8 上安装 Consul

    /* 1. 下载二进制安装文件 */下载地址:https://www.consul.io/downloads.html /* 2. 解压缩安装包 */unzip consul_1.6.2_linux_ ...

  2. iOS 启动时间优化

    在 WWDC 2016 上首次提到了关于 App 应用启动速度优化的话题:Session 406 Optimizing App Startup Time. 一.冷启动与热启动 热启动是,APP会恢复之 ...

  3. Ubuntu查看文件格式(后缀名)

    在文件目录执行: $ file filename #filename表示要查看的文件名

  4. Ubuntu系统在Anaconda中安装Python3.6的虚拟环境

    原因:Anaconda的python版本是3.7的,TensorFlow尚不支持此版本,于是我们创建一个Python的虚拟环境以支持TensorFlow 创建tf环境 conda create --n ...

  5. 数据分析_numpy_基础2

    数据分析_numpy_基础2 sqrt 开方 arr = np.arange(10) arr # array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) np.sqrt(arr) ...

  6. 如何理解EventLoop--浏览器篇

    前言 最近在准备春招,刷到了JS中的主要运行机制--Event Loop,觉得它的实现思路有必要整理一下,以防忘记.关于它在浏览器上的实现,我结合了自己的理解以及示例代码,想用最通俗的语言表达出来.如 ...

  7. 【学习笔记】CART算法

    1. 背景介绍 CART(Classification and Regression Trees,分类回归树)算法是一种树构建算法,既可以用于分类,也可以用于回归.它的工作原理是:使用二元切分来处理连 ...

  8. 前端经典面试题解密:JS的new关键字都干了什么?

    前言 new关键字在实例化获取对象时都做了什么?是一道经常出现在前端面试时的问题.如果只是简单的了解new关键字是实例化构造函数获取对象,是万万不能够的.更深入的层级发生了什么呢?同时面试官想从这道题 ...

  9. POJ 3273Monthly Expense(二分答案)

    题目链接 思路如下 题意:这一题让我们在一个 n 个数的序列,分成连续的的 m个子串(一个数也可是一个子串),是在所有子串中 和最大的子串 的和最小. 思路:我们可以用 二分法 来一个一个枚举答案,二 ...

  10. Python 【基础面试题】

    前言 面试题仅做学习参考,学习者阅后也要用心钻研其中的原理,重要知识需要系统学习.透彻学习,形成自己的知识链.以下五点建议希望对您有帮助,早日拿到一份心仪的offer. 做好细节工作,细致的人运气不会 ...