QANet Encoder
#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
date: 2019/8/19
mail: cally.maxiong@gmail.com
blog: http://www.cnblogs.com/callyblog/
'''
import math
import tensorflow as tf

__all__ = ['encoder']

initializer_relu = lambda: tf.contrib.layers.variance_scaling_initializer(factor=2.0,
                                                                          mode='FAN_IN',
                                                                          uniform=False,
                                                                          dtype=tf.float32)
regularizer = tf.contrib.layers.l2_regularizer(scale=3e-7)


def encoder(inputs, num_blocks, num_conv_layers, kernel_size, inputs_mask, num_filters=128, input_projection=False,
            num_heads=8, is_training=False, reuse=None, dropout=0.0, scope="res_block"):
"""
QAnet encoder
:param inputs: inputs
:param num_blocks: number of conv and self attention block
:param num_conv_layers: number of layers of each conv block
:param kernel_size: kernel size
:param inputs_mask: input mask
:param num_filters: number of conv filters
:param input_projection: whether add linear before through conv and self attention block
:param num_heads: self attention number of heads
:param is_training: whether training
:param reuse: whether reuse variable
:param dropout: dropout rate
:param scope: scope name
"""
    with tf.variable_scope(scope, reuse=reuse):
        if input_projection:
            inputs = tf.layers.conv1d(inputs, filters=num_filters, kernel_size=1, use_bias=False, reuse=reuse,
                                      name='input_projection')

        outputs = inputs
        for i in range(num_blocks):
            # add the sinusoidal position signal before every block
            outputs = _add_timing_signal_1d(outputs)
            outputs = _conv_block(outputs, num_conv_layers, kernel_size, num_filters, reuse=reuse,
                                  is_training=is_training, dropout=dropout, scope="conv_block%d" % i)
            outputs = _multihead_attention(outputs, inputs_mask, dropout_rate=dropout, num_heads=num_heads,
                                           training=is_training, reuse=reuse, scope="self_attention_layers%d" % i)
        return outputs


def _add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
"""Adds a bunch of sinusoids of different frequencies to a Tensor.
Each channel of the input Tensor is incremented by a sinusoid of a different
frequency and phase.
This allows attention to learn to use absolute and relative positions.
Timing signals should be added to some precursors of both the query and the
memory inputs to attention.
The use of relative position is possible because sin(x+y) and cos(x+y) can be
experessed in terms of y, sin(x) and cos(x).
In particular, we use a geometric sequence of timescales starting with
min_timescale and ending with max_timescale. The number of different
timescales is equal to channels / 2. For each timescale, we
generate the two sinusoidal signals sin(timestep/timescale) and
cos(timestep/timescale). All of these sinusoids are concatenated in
the channels dimension.
Args:
x: a Tensor with shape [batch, length, channels]
min_timescale: a float
max_timescale: a float
Returns:
a Tensor the same shape as x.
"""
    length = tf.shape(x)[1]
    channels = tf.shape(x)[2]
    signal = _get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal


def _get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
"""Gets a bunch of sinusoids of different frequencies.
Each channel of the input Tensor is incremented by a sinusoid of a different
frequency and phase.
This allows attention to learn to use absolute and relative positions.
Timing signals should be added to some precursors of both the query and the
memory inputs to attention.
The use of relative position is possible because sin(x+y) and cos(x+y) can be
experessed in terms of y, sin(x) and cos(x).
In particular, we use a geometric sequence of timescales starting with
min_timescale and ending with max_timescale. The number of different
timescales is equal to channels / 2. For each timescale, we
generate the two sinusoidal signals sin(timestep/timescale) and
cos(timestep/timescale). All of these sinusoids are concatenated in
the channels dimension.
Args:
length: scalar, length of timing signal sequence.
channels: scalar, size of timing embeddings to create. The number of
different timescales is equal to channels / 2.
min_timescale: a float
max_timescale: a float
Returns:
a Tensor of timing signals [1, length, channels]
"""
    position = tf.to_float(tf.range(length))
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (tf.to_float(num_timescales) - 1))
    inv_timescales = min_timescale * tf.exp(
        tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
    signal = tf.reshape(signal, [1, length, channels])
    return signal
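
# Layout note (added comment, not in the original file): with channels=128 there
# are 64 timescales spaced geometrically between min_timescale=1.0 and
# max_timescale=1.0e4; columns 0-63 of `signal` hold the sines and columns
# 64-127 the matching cosines before the reshape to [1, length, channels].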


def _conv_block(inputs, num_conv_layers, kernel_size, num_filters, scope="conv_block", is_training=False, reuse=None,
                dropout=0.0):
"""
conv block, contain depth wise separable convolution and conv block
:param inputs: inputs
:param num_conv_layers: number of conv layers
:param kernel_size: conv kernel size
:param num_filters: number of conv filters
:param scope: scope name
:param is_training: whether training
:param reuse: whether reuse variable
:param dropout: dropout rate
"""
    with tf.variable_scope(scope, reuse=reuse):
        outputs = tf.expand_dims(inputs, 2)
        for i in range(num_conv_layers):
            residual = outputs
            outputs = _ln(outputs, scope="layer_norm_%d" % i, reuse=reuse)
            if i % 2 == 0 and is_training:
                outputs = tf.layers.dropout(outputs, dropout, training=is_training)
            outputs = _depthwise_separable_convolution(outputs, kernel_size=(kernel_size, 1), num_filters=num_filters,
                                                       scope="depthwise_conv_layers_%d" % i, reuse=reuse)
            outputs = tf.layers.dropout(outputs, dropout, training=is_training)
            outputs = outputs + residual
        return tf.squeeze(outputs, 2)


def _depthwise_separable_convolution(inputs, kernel_size, num_filters, bias=True, reuse=None,
                                     scope="depthwise_separable_convolution"):
"""
depth wise separable convolution
:param inputs: input
:param kernel_size: kernel size
:param num_filters: number of filter
:param bias: whether use bias
:param reuse: whether reuse variable
:param scope: scope name
"""
    with tf.variable_scope(scope, reuse=reuse):
        shapes = inputs.shape.as_list()
        depthwise_filter = tf.get_variable("depthwise_filter",
                                           (kernel_size[0], kernel_size[1], shapes[-1], 1),
                                           dtype=tf.float32,
                                           regularizer=regularizer,
                                           initializer=initializer_relu())
        pointwise_filter = tf.get_variable("pointwise_filter",
                                           (1, 1, shapes[-1], num_filters),
                                           dtype=tf.float32,
                                           regularizer=regularizer,
                                           initializer=initializer_relu())
        outputs = tf.nn.separable_conv2d(inputs,
                                         depthwise_filter,
                                         pointwise_filter,
                                         strides=(1, 1, 1, 1),
                                         padding="SAME")
        if bias:
            b = tf.get_variable("bias",
                                outputs.shape[-1],
                                regularizer=regularizer,
                                initializer=tf.zeros_initializer())
            outputs += b
        outputs = tf.nn.relu(outputs)
        return outputs
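
# Cost note (added comment, not in the original file): the depthwise + pointwise
# factorization above is much cheaper than a full convolution. With
# kernel_size=(5, 1) and 128 input and output channels it needs
# 5*1*128*1 + 1*1*128*128 = 17,024 weights, versus 5*1*128*128 = 81,920 for a
# standard conv -- roughly a 4.8x reduction.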


def _multihead_attention(inputs,
                         input_mask,
                         num_heads=8,
                         dropout_rate=0.0,
                         training=False,
                         reuse=None,
                         scope="multihead_attention"):
    '''Applies multihead attention. See section 3.2.2 of "Attention Is All You Need".
    inputs: A 3d tensor with shape of [N, T, d_model].
    input_mask: A 2d tensor with shape of [N, T]; 1 for real tokens, 0 for padding.
    num_heads: An int. Number of heads.
    dropout_rate: A floating point number.
    training: Boolean. Controller of mechanism for dropout.
    scope: Optional scope for `variable_scope`.
    Returns:
        A 3d tensor with shape of (N, T, d_model).
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # zero out padded positions, then normalize
        inputs = inputs * tf.cast(tf.expand_dims(input_mask, axis=-1), dtype=tf.float32)
        inputs = _ln(inputs, reuse=reuse, scope=scope+'_layer_normal')

        queries = inputs
        keys = inputs
        values = inputs

        d_model = queries.get_shape().as_list()[-1]

        # Linear projections
        Q = tf.layers.dense(queries, d_model, use_bias=False, reuse=reuse)  # (N, T_q, d_model)
        K = tf.layers.dense(keys, d_model, use_bias=False, reuse=reuse)  # (N, T_k, d_model)
        V = tf.layers.dense(values, d_model, use_bias=False, reuse=reuse)  # (N, T_k, d_model)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, d_model/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)

        # Attention
        outputs = _scaled_dot_product_attention(Q_, K_, V_, dropout_rate, training, reuse=reuse)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, d_model)

        # Feed forward (position-wise 1x1 conv)
        outputs = tf.layers.conv1d(outputs, filters=d_model, kernel_size=1, reuse=reuse)
        outputs = tf.layers.dropout(outputs, dropout_rate, training=training)

        # Residual connection
        outputs = queries + outputs

        # Normalize
        outputs = _ln(outputs, reuse=reuse, scope='feed_forword_layer_normal')

    return outputs


def _scaled_dot_product_attention(Q, K, V,
                                  dropout_rate=0.,
                                  training=False,
                                  reuse=None,
                                  scope="scaled_dot_product_attention"):
    '''Scaled dot-product attention. See section 3.2.1 of "Attention Is All You Need".
    Q: Packed queries. 3d tensor. [N, T_q, d_k].
    K: Packed keys. 3d tensor. [N, T_k, d_k].
    V: Packed values. 3d tensor. [N, T_k, d_v].
    dropout_rate: A floating point number in [0, 1].
    training: Boolean for controlling dropout.
    scope: Optional scope for `variable_scope`.
    Returns:
        A 3d tensor. [N, T_q, d_v].
    '''
    with tf.variable_scope(scope, reuse=reuse):
        d_k = Q.get_shape().as_list()[-1]

        # dot product
        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))  # (N, T_q, T_k)

        # scale
        outputs /= d_k ** 0.5

        # key masking: suppress attention to <pad> keys
        outputs = _mask(outputs, Q, K, type="key")

        # softmax
        outputs = tf.nn.softmax(outputs)
        attention = tf.transpose(outputs, [0, 2, 1])
        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))

        # query masking: zero out the rows that belong to <pad> queries
        outputs = _mask(outputs, Q, K, type="query")

        # dropout
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=training)

        # weighted sum (context vectors)
        outputs = tf.matmul(outputs, V)  # (N, T_q, d_v)

    return outputs


def _mask(inputs, queries=None, keys=None, type=None):
"""Masks paddings on keys or queries to inputs
inputs: 3d tensor. (N, T_q, T_k)
queries: 3d tensor. (N, T_q, d)
keys: 3d tensor. (N, T_k, d) e.g.,
>> queries = tf.constant([[[1.],
[2.],
[0.]]], tf.float32) # (1, 3, 1)
>> keys = tf.constant([[[4.],
[0.]]], tf.float32) # (1, 2, 1)
>> inputs = tf.constant([[[4., 0.],
[8., 0.],
[0., 0.]]], tf.float32)
>> mask(inputs, queries, keys, "key")
array([[[ 4.0000000e+00, -4.2949673e+09],
[ 8.0000000e+00, -4.2949673e+09],
[ 0.0000000e+00, -4.2949673e+09]]], dtype=float32)
>> inputs = tf.constant([[[1., 0.],
[1., 0.],
[1., 0.]]], tf.float32)
>> mask(inputs, queries, keys, "query")
array([[[1., 0.],
[1., 0.],
[0., 0.]]], dtype=float32)
"""
    outputs = None
    padding_num = -2 ** 32 + 1
    if type in ("k", "key", "keys"):
        # Generate masks from the key vectors: 0 where the key is all-zero (<pad>)
        masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        masks = tf.expand_dims(masks, 1)  # (N, 1, T_k)
        masks = tf.tile(masks, [1, tf.shape(queries)[1], 1])  # (N, T_q, T_k)

        # Apply masks to inputs: padded keys get a large negative logit
        paddings = tf.ones_like(inputs) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)  # (N, T_q, T_k)
    elif type in ("q", "query", "queries"):
        # Generate masks from the query vectors
        masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
        masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)

        # Apply masks to inputs: rows of padded queries become zero
        outputs = inputs * masks
    else:
        print("Check if you entered type correctly!")

    return outputs


def _ln(inputs, epsilon=1e-6, reuse=None, scope="ln"):
    '''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
    inputs: A tensor with 2 or more dimensions, where the first dimension is `batch_size`.
    epsilon: A small floating point number for preventing division by zero.
    scope: Optional scope for `variable_scope`.
    Returns:
        A tensor with the same shape and dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta

    return outputs
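
As a quick sanity check of the positional signal, the snippet below (a sketch added here, not part of the original file; it assumes TensorFlow 1.x, and the length 50 and channel count 128 are arbitrary) evaluates _get_timing_signal_1d and confirms the sine/cosine layout:

import numpy as np

# run in a fresh TF1 graph; _get_timing_signal_1d is defined above
with tf.Graph().as_default(), tf.Session() as sess:
    sig = sess.run(_get_timing_signal_1d(50, 128))
    print(sig.shape)                         # (1, 50, 128)
    print(np.allclose(sig[0, 0, :64], 0.0))  # sin(0/timescale) == 0 -> True
    print(np.allclose(sig[0, 0, 64:], 1.0))  # cos(0/timescale) == 1 -> True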
In QANet's final three model encoders, the parameters are set as follows, where hidden_size is the hidden size of the context-query attention output:
encoder(enc[i], num_blocks=7, num_conv_layers=2, kernel_size=5, inputs_mask=input_mask, num_filters=hidden_size, num_heads=8,
scope='Model_Encoder', reuse=True if i > 0 else None, is_training=False, dropout=0.1)
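
For context, here is a minimal end-to-end sketch of that call (added for illustration; it assumes TensorFlow 1.x with the module above in scope, and the batch size, sequence length, and hidden_size below are made-up placeholders):

import numpy as np

batch_size, seq_len, hidden_size = 4, 100, 128

inputs = tf.placeholder(tf.float32, [None, seq_len, hidden_size])
input_mask = tf.placeholder(tf.int32, [None, seq_len])  # 1 = real token, 0 = <pad>

# the three stacked model encoders share weights through `reuse`
enc = [inputs]
for i in range(3):
    enc.append(encoder(enc[i], num_blocks=7, num_conv_layers=2, kernel_size=5,
                       inputs_mask=input_mask, num_filters=hidden_size, num_heads=8,
                       scope='Model_Encoder', reuse=True if i > 0 else None,
                       is_training=False, dropout=0.1))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(enc[-1], {inputs: np.random.randn(batch_size, seq_len, hidden_size),
                             input_mask: np.ones((batch_size, seq_len), np.int32)})
    print(out.shape)  # (4, 100, 128)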
写在前面: ESLint: Find and fix problems in your JavaScript code. Prettier: Prettier is an opinionated co ...