QAnet Encoder
#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
date: 2019/8/19
mail: cally.maxiong@gmail.com
blog: http://www.cnblogs.com/callyblog/
'''
import math
import tensorflow as tf __all__ = ['encoder'] initializer_relu = lambda: tf.contrib.layers.variance_scaling_initializer(factor=2.0,
mode='FAN_IN',
uniform=False,
dtype=tf.float32)
regularizer = tf.contrib.layers.l2_regularizer(scale=3e-7) def encoder(inputs, num_blocks, num_conv_layers, kernel_size, inputs_mask, num_filters=128, input_projection=False,
num_heads=8, is_training=False, reuse=None, dropout=0.0, scope="res_block"):
"""
QAnet encoder
:param inputs: inputs
:param num_blocks: number of conv and self attention block
:param num_conv_layers: number of layers of each conv block
:param kernel_size: kernel size
:param inputs_mask: input mask
:param num_filters: number of conv filters
:param input_projection: whether add linear before through conv and self attention block
:param num_heads: self attention number of heads
:param is_training: whether training
:param reuse: whether reuse variable
:param dropout: dropout rate
:param scope: scope name
"""
with tf.variable_scope(scope, reuse=reuse):
if input_projection:
inputs = tf.layers.conv1d(inputs, filters=num_filters, kernel_size=1, use_bias=False, reuse=reuse, name='input_projection') outputs = inputs for i in range(num_blocks):
outputs = _add_timing_signal_1d(outputs)
outputs = _conv_block(outputs, num_conv_layers, kernel_size, num_filters, reuse=reuse, is_training=is_training,
dropout=dropout, scope="conv_block%d" % i) outputs = _multihead_attention(outputs, inputs_mask, dropout_rate=dropout, num_heads=num_heads,
training=is_training, reuse=reuse, scope="self_attention_layers%d" % i)
return outputs def _add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
"""Adds a bunch of sinusoids of different frequencies to a Tensor.
Each channel of the input Tensor is incremented by a sinusoid of a different
frequency and phase.
This allows attention to learn to use absolute and relative positions.
Timing signals should be added to some precursors of both the query and the
memory inputs to attention.
The use of relative position is possible because sin(x+y) and cos(x+y) can be
experessed in terms of y, sin(x) and cos(x).
In particular, we use a geometric sequence of timescales starting with
min_timescale and ending with max_timescale. The number of different
timescales is equal to channels / 2. For each timescale, we
generate the two sinusoidal signals sin(timestep/timescale) and
cos(timestep/timescale). All of these sinusoids are concatenated in
the channels dimension.
Args:
x: a Tensor with shape [batch, length, channels]
min_timescale: a float
max_timescale: a float
Returns:
a Tensor the same shape as x.
"""
length = tf.shape(x)[1]
channels = tf.shape(x)[2]
signal = _get_timing_signal_1d(length, channels, min_timescale, max_timescale)
return x + signal def _get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
"""Gets a bunch of sinusoids of different frequencies.
Each channel of the input Tensor is incremented by a sinusoid of a different
frequency and phase.
This allows attention to learn to use absolute and relative positions.
Timing signals should be added to some precursors of both the query and the
memory inputs to attention.
The use of relative position is possible because sin(x+y) and cos(x+y) can be
experessed in terms of y, sin(x) and cos(x).
In particular, we use a geometric sequence of timescales starting with
min_timescale and ending with max_timescale. The number of different
timescales is equal to channels / 2. For each timescale, we
generate the two sinusoidal signals sin(timestep/timescale) and
cos(timestep/timescale). All of these sinusoids are concatenated in
the channels dimension.
Args:
length: scalar, length of timing signal sequence.
channels: scalar, size of timing embeddings to create. The number of
different timescales is equal to channels / 2.
min_timescale: a float
max_timescale: a float
Returns:
a Tensor of timing signals [1, length, channels]
"""
position = tf.to_float(tf.range(length))
num_timescales = channels // 2
log_timescale_increment = (
math.log(float(max_timescale) / float(min_timescale)) /
(tf.to_float(num_timescales) - 1))
inv_timescales = min_timescale * tf.exp(
tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
signal = tf.reshape(signal, [1, length, channels])
return signal def _conv_block(inputs, num_conv_layers, kernel_size, num_filters, scope="conv_block", is_training=False, reuse=None,
dropout=0.0):
"""
conv block, contain depth wise separable convolution and conv block
:param inputs: inputs
:param num_conv_layers: number of conv layers
:param kernel_size: conv kernel size
:param num_filters: number of conv filters
:param scope: scope name
:param is_training: whether training
:param reuse: whether reuse variable
:param dropout: dropout rate
"""
with tf.variable_scope(scope, reuse=reuse):
outputs = tf.expand_dims(inputs, 2) for i in range(num_conv_layers):
residual = outputs
outputs = _ln(outputs, scope="layer_norm_%d" % i, reuse=reuse) if i % 2 == 0 and is_training:
outputs = tf.layers.dropout(outputs, dropout, training=is_training) outputs = _depthwise_separable_convolution(outputs, kernel_size=(kernel_size, 1), num_filters=num_filters,
scope="depthwise_conv_layers_%d" % i, reuse=reuse) outputs = tf.layers.dropout(outputs, dropout, training=is_training)
outputs = outputs + residual return tf.squeeze(outputs, 2) def _depthwise_separable_convolution(inputs, kernel_size, num_filters, bias=True, reuse=None,
scope="depthwise_separable_convolution"):
"""
depth wise separable convolution
:param inputs: input
:param kernel_size: kernel size
:param num_filters: number of filter
:param bias: whether use bias
:param reuse: whether reuse variable
:param scope: scope name
"""
with tf.variable_scope(scope, reuse=reuse):
shapes = inputs.shape.as_list()
depthwise_filter = tf.get_variable("depthwise_filter",
(kernel_size[0], kernel_size[1], shapes[-1], 1),
dtype=tf.float32,
regularizer=regularizer,
initializer=initializer_relu())
pointwise_filter = tf.get_variable("pointwise_filter",
(1, 1, shapes[-1], num_filters),
dtype=tf.float32,
regularizer=regularizer,
initializer=initializer_relu())
outputs = tf.nn.separable_conv2d(inputs,
depthwise_filter,
pointwise_filter,
strides=(1, 1, 1, 1),
padding="SAME") if bias:
b = tf.get_variable("bias",
outputs.shape[-1],
regularizer=regularizer,
initializer=tf.zeros_initializer())
outputs += b
outputs = tf.nn.relu(outputs)
return outputs def _multihead_attention(inputs,
input_mask,
num_heads=8,
dropout_rate=0.0,
training=False,
reuse=None,
scope="multihead_attention"):
'''Applies multihead attention. See 3.2.2
inputs: A 3d tensor with shape of [N, T, d_model].
input_mask: A 3d tensor with shape of [N, T].
num_heads: An int. Number of heads.
dropout_rate: A floating point number.
training: Boolean. Controller of mechanism for dropout.
causality: Boolean. If true, units that reference the future are masked.
scope: Optional scope for `variable_scope`. Returns
A 3d tensor with shape of (N, T_q, C)
''' with tf.variable_scope(scope, reuse=reuse):
inputs = inputs * tf.cast(tf.expand_dims(input_mask, axis=-1), dtype=tf.float32)
inputs = _ln(inputs, reuse=reuse, scope=scope+'_layer_normal') queries = inputs
keys = inputs
values = inputs d_model = queries.get_shape().as_list()[-1]
# Linear projections
Q = tf.layers.dense(queries, d_model, use_bias=False, reuse=reuse) # (N, T_q, d_model)
K = tf.layers.dense(keys, d_model, use_bias=False, reuse=reuse) # (N, T_k, d_model)
V = tf.layers.dense(values, d_model, use_bias=False, reuse=reuse) # (N, T_k, d_model) # Split and concat
Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, d_model/h)
K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, d_model/h)
V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, d_model/h) # Attention
outputs = _scaled_dot_product_attention(Q_, K_, V_, dropout_rate, training, reuse=reuse) # Restore shape
outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, d_model) # feed forward
outputs = tf.layers.conv1d(outputs, filters=d_model, kernel_size=1, reuse=reuse, trainable=training)
outputs = tf.layers.dropout(outputs, dropout_rate, training=training) # Residual connection
outputs = queries + outputs # Normalize
outputs = _ln(outputs, reuse=reuse, scope='feed_forword_layer_normal') return outputs def _scaled_dot_product_attention(Q, K, V,
dropout_rate=0.,
training=False,
reuse=None,
scope="scaled_dot_product_attention"):
'''See 3.2.1.
Q: Packed queries. 3d tensor. [N, T_q, d_k].
K: Packed keys. 3d tensor. [N, T_k, d_k].
V: Packed values. 3d tensor. [N, T_k, d_v].
causality: If True, applies masking for future blinding
dropout_rate: A floating point number of [0, 1].
training: boolean for controlling droput
scope: Optional scope for `variable_scope`.
'''
with tf.variable_scope(scope, reuse=reuse):
d_k = Q.get_shape().as_list()[-1] # dot product
outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1])) # (N, T_q, T_k) # scale
outputs /= d_k ** 0.5 # key masking, delete key 0
outputs = _mask(outputs, Q, K, type="key") # softmax
outputs = tf.nn.softmax(outputs)
attention = tf.transpose(outputs, [0, 2, 1])
tf.summary.image("attention", tf.expand_dims(attention[:1], -1)) # query masking, delete query <pad>
outputs = _mask(outputs, Q, K, type="query") # dropout
outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=training) # weighted sum (context vectors)
outputs = tf.matmul(outputs, V) # (N, T_q, d_v) return outputs def _mask(inputs, queries=None, keys=None, type=None):
"""Masks paddings on keys or queries to inputs
inputs: 3d tensor. (N, T_q, T_k)
queries: 3d tensor. (N, T_q, d)
keys: 3d tensor. (N, T_k, d) e.g.,
>> queries = tf.constant([[[1.],
[2.],
[0.]]], tf.float32) # (1, 3, 1)
>> keys = tf.constant([[[4.],
[0.]]], tf.float32) # (1, 2, 1)
>> inputs = tf.constant([[[4., 0.],
[8., 0.],
[0., 0.]]], tf.float32)
>> mask(inputs, queries, keys, "key")
array([[[ 4.0000000e+00, -4.2949673e+09],
[ 8.0000000e+00, -4.2949673e+09],
[ 0.0000000e+00, -4.2949673e+09]]], dtype=float32)
>> inputs = tf.constant([[[1., 0.],
[1., 0.],
[1., 0.]]], tf.float32)
>> mask(inputs, queries, keys, "query")
array([[[1., 0.],
[1., 0.],
[0., 0.]]], dtype=float32)
"""
outputs = None
padding_num = -2 ** 32 + 1
if type in ("k", "key", "keys"):
# Generate masks
masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1)) # (N, T_k)
masks = tf.expand_dims(masks, 1) # (N, 1, T_k)
masks = tf.tile(masks, [1, tf.shape(queries)[1], 1]) # (N, T_q, T_k) # Apply masks to inputs
paddings = tf.ones_like(inputs) * padding_num outputs = tf.where(tf.equal(masks, 0), paddings, inputs) # (N, T_q, T_k)
elif type in ("q", "query", "queries"):
# Generate masks
masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1)) # (N, T_q)
masks = tf.expand_dims(masks, -1) # (N, T_q, 1)
masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]]) # (N, T_q, T_k) # Apply masks to inputs
outputs = inputs*masks
else:
print("Check if you entered type correctly!") return outputs def _ln(inputs, epsilon=1e-6, reuse=None, scope="ln"):
'''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
epsilon: A floating number. A very small number for preventing ZeroDivision Error.
scope: Optional scope for `variable_scope`. Returns:
A tensor with the same shape and data dtype as `inputs`.
'''
with tf.variable_scope(scope, reuse=reuse):
inputs_shape = inputs.get_shape()
params_shape = inputs_shape[-1:] mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
outputs = gamma * normalized + beta return outputs
在QAnet最后的三个encoder中,各项参数为,其中hidden size为context_query输出的hidden size
encoder(enc[i], num_blocks=7, num_conv_layers=2, kernel_size=5, inputs_mask=input_mask, num_filters=hidden_size, num_heads=8,
scope='Model_Encoder', reuse=True if i > 0 else None, is_training=False, dropout=0.1)
QAnet Encoder的更多相关文章
- QANet
Reading Comprehension(RC) 阅读理解对于机器来说, 是一项非常艰巨的任务.google提出QANet, 目前(2018 0505)一直是SQuAD的No. 1. 今天简单地与大 ...
- Intel Media SDK H264 encoder GOP setting
1 I帧,P帧,B帧,IDR帧,NAL单元 I frame:帧内编码帧,又称intra picture,I 帧通常是每个 GOP(MPEG 所使用的一种视频压缩技术)的第一个帧,经过适度地压缩,做为随 ...
- java url encoder 的一个问题
@RequestMapping(value = {"/search"}) public String errorPath(HttpServletResponse response, ...
- C# 字符编码解码 Encoder 和Decoder
在网络传输和文件操作中,如果数据量很大,需要将其划分为较小的快,此时可能出现一个数据块的末尾是一个不匹配的高代理项,而与其匹配的低代理项在下一个数据块. 这时候使用Encoding的GetBytes方 ...
- C# 与 Microsoft Expression Encoder实现屏幕录制
在日常开发中,我们会经常遇到屏幕录制的需求.在C#中可以通过Expression Encoder的SDK实现这样的需求.首先需要下载Expression Encoder SDK,实现代码: priva ...
- NGif, Animated GIF Encoder for .NET
1.简介 链接: http://www.codeproject.com/Articles/11505/NGif-Animated-GIF-Encoder-for-NET 2.代码使用 1)多个Imag ...
- 是否允许处理由Zend Encoder加密的PHP文件
Zend Optimizer是由PHP核心引擎"Zend"创建者Zend技术公司所开的免费PHP优化软件.据Zend公司透露使用这个软件某些情况下至少可以提高性能30%以上!Zen ...
- 自定义Encoder/Decoder进行对象传递
转载:http://blog.csdn.net/top_code/article/details/50901623 在上一篇文章中,我们使用Netty4本身自带的ObjectDecoder,Objec ...
- expression encoder 4 安装 出现“已经安排重启您的计算机
问题: expression encoder 4 安装 出现“已经安排重启您的计算机 解决的办法,注册表数据的修改 开始 运行 regedit HKEY_LOCAL_MACHINE\SYSTEM\C ...
随机推荐
- 区块链学习——HyperLedger-Fabric v0.6环境搭建详细教程
v0.6 的架构相对简单,适合作为实验或学习来使用. 一.环境准备 一台云服务器(笔者使用的是阿里云的1核-2GB内存) Go语言环境 Docker安装 docker-compose安装 二.环境搭建 ...
- DefinePlugin插件用法
作者:水涛 座右铭:天行健,君子以自强不息 自白:我写博文上来蹭蹭就是干,我突然觉得我需要幽默一点了,好了,下面我们说正经的 一.官方定义: DefinePlugin DefinePlugin 允许创 ...
- dev gridcontrol 行号
/// <summary> /// GridView 显示行号 设置行号列的宽度 /// </summary> /// <param name="gv" ...
- 集群环境下,你不得不注意的ASP.NET Core Data Protection 机制
引言 最近线上环境遇到一个问题,就是ASP.NET Core Web应用在单个容器使用正常,扩展多个容器无法访问的问题.查看容器日志,发现以下异常: System.Security.Cryptogra ...
- Mysql数据库的主从与主主
前言: 首先,我们得知道我们为什么要做数据库的主从与主主,而后,再讨论他们的优劣与特点:为什么要做主从呢?因为Mysql数据库没有增量备份的机制,当数据量太大的时候备份是个难以解决的问题.但是mysq ...
- 剑指offer题解(Java版)
剑指offer题解(Java版) 从尾到头打印链表 题目描述 输入一个链表,按从尾到头的顺序返回一个ArrayList. 方法1:用一个栈保存从头到尾访问链表的每个结点的值,然后按出栈顺序将各个值存入 ...
- Android 音视频技术之录音获取实时音量
一.实时音量相关基础知识 说到获取音量,大家首先想到的应该就是分贝(dB),分贝是一个相对单位(是一个比值,是一个数值,是一个纯计数方法). 在音频领域dB度量的是声音的强度,其计算的公式如下: 在上 ...
- DG重启之后主备数据不同步
问题描述:本来配置好的DG第二天重启之后,发现主备库数据不能同步,在主库上执行日志切换以及创建表操作都传不到备库上,造成这种错误的原因是主库实例断掉后造成备库日志与主库无法实时接收 主库:orcl ...
- <深度学习>Tensorflow遇到的坑之一
AttributeError: module 'tensorflow' has no attribute 'random_normal' AttributeError: module 'tensorf ...
- JMeter多脚本间的启动延时
JMeter做压测时,当需要多个jmx脚本依次执行时,需要用到“启动延时”,即间隔可设置的时间后启动运行下一个jmx脚本. 实现“启动延时”的方法有2个. 方法一.利用JMeter线程组中的" ...