QAnet Encoder

#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
date: 2019/8/19
mail: cally.maxiong@gmail.com
blog: http://www.cnblogs.com/callyblog/
'''
import math
import tensorflow as tf

__all__ = ['encoder']

def initializer_relu():
    """Build the weight initializer used for the ReLU-activated convolutions.

    Returns a new ``variance_scaling_initializer`` on every call, configured
    with ``factor=2.0``, ``mode='FAN_IN'`` and ``uniform=False`` for float32
    variables.
    """
    return tf.contrib.layers.variance_scaling_initializer(factor=2.0,
                                                          mode='FAN_IN',
                                                          uniform=False,
                                                          dtype=tf.float32)


# Shared L2 regularizer (scale 3e-7) attached to the convolution variables below.
regularizer = tf.contrib.layers.l2_regularizer(scale=3e-7)

def encoder(inputs, num_blocks, num_conv_layers, kernel_size, inputs_mask, num_filters=128, input_projection=False,
            num_heads=8, is_training=False, reuse=None, dropout=0.0, scope="res_block"):
    """
    QANet encoder: a stack of (position encoding -> conv block -> self attention) blocks.

    :param inputs: input tensor; the position-encoding helper reads dimensions 1 and 2 as
        length and channels, so a [batch, length, channels] layout is expected
    :param num_blocks: how many conv + self-attention blocks to stack
    :param num_conv_layers: number of convolution layers inside each conv block
    :param kernel_size: convolution kernel size along the sequence axis
    :param inputs_mask: padding mask forwarded to every self-attention layer
    :param num_filters: number of convolution filters (also the width of the optional projection)
    :param input_projection: if True, apply a bias-free kernel-size-1 convolution to the
        inputs before the first block
    :param num_heads: number of self-attention heads
    :param is_training: training flag, forwarded to the dropout layers
    :param reuse: variable reuse flag passed to every variable scope
    :param dropout: dropout rate
    :param scope: name of the enclosing variable scope
    :return: the output of the last block (or the possibly projected inputs when
        ``num_blocks`` is 0)
    """
    with tf.variable_scope(scope, reuse=reuse):
        hidden = inputs
        if input_projection:
            hidden = tf.layers.conv1d(hidden, filters=num_filters, kernel_size=1, use_bias=False, reuse=reuse,
                                      name='input_projection')

        for block_idx in range(num_blocks):
            # Each block owns its own sub-scopes, so blocks never share weights with each other.
            conv_scope = "conv_block%d" % block_idx
            attention_scope = "self_attention_layers%d" % block_idx

            # The timing signal is re-added at the start of every block, not just once.
            hidden = _add_timing_signal_1d(hidden)
            hidden = _conv_block(hidden, num_conv_layers, kernel_size, num_filters, scope=conv_scope,
                                 is_training=is_training, reuse=reuse, dropout=dropout)
            hidden = _multihead_attention(hidden, inputs_mask, num_heads=num_heads, dropout_rate=dropout,
                                          training=is_training, reuse=reuse, scope=attention_scope)

    return hidden

def _add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add sinusoidal position information to a sequence tensor.

    A timing signal made of sines and cosines at geometrically spaced
    timescales (from min_timescale to max_timescale) is built for the dynamic
    length and channel count of `x` and added to it. The signal has a leading
    dimension of 1, so the same signal is broadcast over the batch.

    Args:
    x: a Tensor with shape [batch, length, channels]
    min_timescale: a float, the smallest timescale
    max_timescale: a float, the largest timescale
    Returns:
    a Tensor the same shape as x.
    """
    # Use the dynamic shape so sequences of unknown static length are supported.
    dynamic_shape = tf.shape(x)
    seq_len, depth = dynamic_shape[1], dynamic_shape[2]
    return x + _get_timing_signal_1d(seq_len, depth, min_timescale, max_timescale)

def _get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Gets a bunch of sinusoids of different frequencies.
    Each channel of the input Tensor is incremented by a sinusoid of a different
    frequency and phase.
    This allows attention to learn to use absolute and relative positions.
    Timing signals should be added to some precursors of both the query and the
    memory inputs to attention.
    The use of relative position is possible because sin(x+y) and cos(x+y) can be
    expressed in terms of y, sin(x) and cos(x).
    In particular, we use a geometric sequence of timescales starting with
    min_timescale and ending with max_timescale.  The number of different
    timescales is equal to channels // 2. For each timescale, we
    generate the two sinusoidal signals sin(timestep/timescale) and
    cos(timestep/timescale).  All of these sinusoids are concatenated in
    the channels dimension; when channels is odd the last channel is zero padding.
    Args:
    length: scalar, length of timing signal sequence.
    channels: scalar, size of timing embeddings to create. The number of
        different timescales is equal to channels // 2.
    min_timescale: a float
    max_timescale: a float
    Returns:
    a Tensor of timing signals [1, length, channels]
    """
    position = tf.to_float(tf.range(length))
    num_timescales = channels // 2
    # Clamp the denominator to at least 1: with a single timescale (channels of
    # 2 or 3) `num_timescales - 1` is 0, which would make the increment infinite
    # and turn the whole signal into NaN.
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        tf.maximum(tf.to_float(num_timescales) - 1, 1))
    inv_timescales = min_timescale * tf.exp(
        tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
    # Outer product: [length, 1] * [1, num_timescales] -> [length, num_timescales]
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    # sin/cos cover 2 * (channels // 2) channels; pad one zero channel when channels is odd.
    signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
    signal = tf.reshape(signal, [1, length, channels])
    return signal

def _conv_block(inputs, num_conv_layers, kernel_size, num_filters, scope="conv_block", is_training=False, reuse=None,
                dropout=0.0):
    """
    Residual stack of layer-normalised depthwise separable convolutions.

    Each layer computes ``dropout(conv(layer_norm(x))) + x``; on even-numbered layers an
    extra dropout is applied between the layer norm and the convolution while training.

    :param inputs: input tensor (a dimension is inserted at axis 2 for the 2-D convolution
        and removed again before returning)
    :param num_conv_layers: number of convolution layers
    :param kernel_size: kernel size along the sequence axis
    :param num_filters: number of convolution filters
    :param scope: variable scope name
    :param is_training: training flag controlling dropout
    :param reuse: variable reuse flag
    :param dropout: dropout rate
    :return: tensor with the extra axis-2 dimension squeezed out
    """
    with tf.variable_scope(scope, reuse=reuse):
        # The separable convolution is 2-D, so add a width-1 axis at position 2.
        hidden = tf.expand_dims(inputs, 2)
        conv_kernel = (kernel_size, 1)

        for layer_idx in range(num_conv_layers):
            shortcut = hidden
            normed = _ln(hidden, scope="layer_norm_%d" % layer_idx, reuse=reuse)

            on_even_layer = layer_idx % 2 == 0
            if on_even_layer and is_training:
                normed = tf.layers.dropout(normed, dropout, training=is_training)

            convolved = _depthwise_separable_convolution(normed, kernel_size=conv_kernel, num_filters=num_filters,
                                                         scope="depthwise_conv_layers_%d" % layer_idx, reuse=reuse)
            convolved = tf.layers.dropout(convolved, dropout, training=is_training)

            # Residual connection around the whole layer.
            hidden = convolved + shortcut

        return tf.squeeze(hidden, 2)

def _depthwise_separable_convolution(inputs, kernel_size, num_filters, bias=True, reuse=None,
                                     scope="depthwise_separable_convolution"):
    """
    Depthwise separable 2-D convolution followed by an optional bias and a ReLU.

    :param inputs: 4-D input tensor; its static last dimension is used as the channel count
    :param kernel_size: pair (height, width) of the depthwise kernel
    :param num_filters: number of output channels of the pointwise convolution
    :param bias: whether to add a learned bias before the ReLU
    :param reuse: variable reuse flag
    :param scope: variable scope name
    :return: the ReLU-activated convolution output
    """
    with tf.variable_scope(scope, reuse=reuse):
        in_channels = inputs.shape.as_list()[-1]
        depthwise_shape = (kernel_size[0], kernel_size[1], in_channels, 1)
        pointwise_shape = (1, 1, in_channels, num_filters)

        # Channel multiplier of 1: one spatial filter per input channel.
        depthwise_filter = tf.get_variable("depthwise_filter",
                                           depthwise_shape,
                                           dtype=tf.float32,
                                           regularizer=regularizer,
                                           initializer=initializer_relu())
        # 1x1 convolution that mixes the channels into `num_filters` outputs.
        pointwise_filter = tf.get_variable("pointwise_filter",
                                           pointwise_shape,
                                           dtype=tf.float32,
                                           regularizer=regularizer,
                                           initializer=initializer_relu())

        convolved = tf.nn.separable_conv2d(inputs,
                                           depthwise_filter,
                                           pointwise_filter,
                                           strides=(1, 1, 1, 1),
                                           padding="SAME")

        if bias:
            offset = tf.get_variable("bias",
                                     convolved.shape[-1],
                                     regularizer=regularizer,
                                     initializer=tf.zeros_initializer())
            convolved = convolved + offset

        return tf.nn.relu(convolved)

def _multihead_attention(inputs,
                         input_mask,
                         num_heads=8,
                         dropout_rate=0.0,
                         training=False,
                         reuse=None,
                         scope="multihead_attention"):
    '''Applies multihead self attention followed by a position-wise projection.

    The inputs are masked, layer-normalised and used as queries, keys and
    values at once. The attention output goes through a kernel-size-1
    convolution and dropout, is added to the normalised inputs (residual
    connection) and is layer-normalised again.

    inputs: A 3d tensor with shape of [N, T, d_model].
    input_mask: A 2d tensor with shape of [N, T]; it is cast to float and
      multiplied into the inputs, so padded positions should be 0.
    num_heads: An int. Number of heads. The last dimension of the projected
      tensors is split into this many equal parts.
    dropout_rate: A floating point number.
    training: Boolean. Controller of mechanism for dropout.
    reuse: Variable reuse flag passed to the variable scopes and layers.
    scope: Optional scope for `variable_scope`.

    Returns
      A 3d tensor with shape of [N, T, d_model].
    '''

    with tf.variable_scope(scope, reuse=reuse):
        # Zero out padded positions before normalising.
        inputs = inputs * tf.cast(tf.expand_dims(input_mask, axis=-1), dtype=tf.float32)
        inputs = _ln(inputs, reuse=reuse, scope=scope+'_layer_normal')

        # Self attention: queries, keys and values all come from the same tensor.
        queries = inputs
        keys = inputs
        values = inputs

        d_model = queries.get_shape().as_list()[-1]
        # Linear projections
        Q = tf.layers.dense(queries, d_model, use_bias=False, reuse=reuse)  # (N, T_q, d_model)
        K = tf.layers.dense(keys, d_model, use_bias=False, reuse=reuse)  # (N, T_k, d_model)
        V = tf.layers.dense(values, d_model, use_bias=False, reuse=reuse)  # (N, T_k, d_model)

        # Split the model dimension into heads and stack the heads along the batch axis
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, d_model/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)

        # Attention
        outputs = _scaled_dot_product_attention(Q_, K_, V_, dropout_rate, training, reuse=reuse)

        # Restore shape: undo the head stacking
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, d_model)

        # Feed forward. The layer's variables must always be trainable: `trainable`
        # controls membership in the TRAINABLE_VARIABLES collection and is unrelated
        # to the train/inference mode, so it must not be tied to `training`.
        outputs = tf.layers.conv1d(outputs, filters=d_model, kernel_size=1, reuse=reuse)
        outputs = tf.layers.dropout(outputs, dropout_rate, training=training)

        # Residual connection
        outputs = queries + outputs

        # Normalize
        outputs = _ln(outputs, reuse=reuse, scope='feed_forword_layer_normal')

    return outputs

def _scaled_dot_product_attention(Q, K, V,
                                  dropout_rate=0.,
                                  training=False,
                                  reuse=None,
                                  scope="scaled_dot_product_attention"):
    '''Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V with padding masks.

    Q: Packed queries. 3d tensor. [N, T_q, d_k].
    K: Packed keys. 3d tensor. [N, T_k, d_k].
    V: Packed values. 3d tensor. [N, T_k, d_v].
    dropout_rate: A floating point number of [0, 1].
    training: boolean for controlling dropout
    reuse: Variable reuse flag for the variable scope.
    scope: Optional scope for `variable_scope`.

    Returns a 3d tensor [N, T_q, d_v]. As a side effect an image summary named
    "attention" is registered for the first example of the batch.
    '''
    # NOTE(review): padded positions are detected by `_mask` as all-zero rows of
    # K / Q; confirm that the projected inputs are still exactly zero at padded
    # positions in the configurations you use.
    with tf.variable_scope(scope, reuse=reuse):
        depth = Q.get_shape().as_list()[-1]

        # Raw similarity scores, scaled by sqrt(d_k)
        keys_transposed = tf.transpose(K, [0, 2, 1])
        scores = tf.matmul(Q, keys_transposed)  # (N, T_q, T_k)
        scores = scores / (depth ** 0.5)

        # Padded keys get a large negative score so that softmax ignores them
        scores = _mask(scores, Q, K, type="key")

        # Normalise over the key axis
        weights = tf.nn.softmax(scores)

        # Log the (transposed) attention map of the first example as an image
        attention = tf.transpose(weights, [0, 2, 1])
        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))

        # Rows that belong to padded queries are zeroed out
        weights = _mask(weights, Q, K, type="query")

        weights = tf.layers.dropout(weights, rate=dropout_rate, training=training)

        # Weighted sum of the values (context vectors)
        context = tf.matmul(weights, V)  # (N, T_q, d_v)

    return context

def _mask(inputs, queries=None, keys=None, type=None):
    """Masks paddings on keys or queries to inputs
    inputs: 3d tensor. (N, T_q, T_k)
    queries: 3d tensor. (N, T_q, d)
    keys: 3d tensor. (N, T_k, d)

    e.g.,
    >> queries = tf.constant([[[1.],
                        [2.],
                        [0.]]], tf.float32) # (1, 3, 1)
    >> keys = tf.constant([[[4.],
                     [0.]]], tf.float32)  # (1, 2, 1)
    >> inputs = tf.constant([[[4., 0.],
                               [8., 0.],
                               [0., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "key")
    array([[[ 4.0000000e+00, -4.2949673e+09],
        [ 8.0000000e+00, -4.2949673e+09],
        [ 0.0000000e+00, -4.2949673e+09]]], dtype=float32)
    >> inputs = tf.constant([[[1., 0.],
                             [1., 0.],
                              [1., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "query")
    array([[[1., 0.],
        [1., 0.],
        [0., 0.]]], dtype=float32)
    """
    outputs = None
    padding_num = -2 ** 32 + 1
    if type in ("k", "key", "keys"):
        # Generate masks
        masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        masks = tf.expand_dims(masks, 1) # (N, 1, T_k)
        masks = tf.tile(masks, [1, tf.shape(queries)[1], 1])  # (N, T_q, T_k)

        # Apply masks to inputs
        paddings = tf.ones_like(inputs) * padding_num

        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)  # (N, T_q, T_k)
    elif type in ("q", "query", "queries"):
        # Generate masks
        masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
        masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)

        # Apply masks to inputs
        outputs = inputs*masks
    else:
        print("Check if you entered type correctly!")

    return outputs

def _ln(inputs, epsilon=1e-6, reuse=None, scope="ln"):
    '''Layer normalization over the last dimension. See https://arxiv.org/abs/1607.06450.

    The inputs are shifted and scaled to zero mean and unit variance along the
    last axis, then multiplied by a learned `gamma` and shifted by a learned
    `beta` (both of the size of the last dimension).

    inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
    epsilon: A floating number added to the variance to avoid division by zero.
    reuse: Variable reuse flag for the variable scope.
    scope: Optional scope for `variable_scope`.

    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # One scale and one offset per feature of the last dimension.
        feature_shape = inputs.get_shape()[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", feature_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", feature_shape, initializer=tf.ones_initializer())

        centered = inputs - mean
        stddev = (variance + epsilon) ** (.5)
        return gamma * (centered / stddev) + beta

在QANet最后的三个model encoder中，各项参数如下，其中hidden_size为context-query attention层输出的hidden size：

# Example call for the i-th stacked encoder: every call uses the same scope name,
# and variables are reused (reuse=True) for every call after the first (i > 0).
encoder(enc[i], num_blocks=7, num_conv_layers=2, kernel_size=5, inputs_mask=input_mask, num_filters=hidden_size, num_heads=8,

                             scope='Model_Encoder', reuse=True if i > 0 else None, is_training=False, dropout=0.1)

QAnet Encoder的更多相关文章

QANet
Reading Comprehension(RC) 阅读理解对于机器来说, 是一项非常艰巨的任务.google提出QANet, 目前(2018 0505)一直是SQuAD的No. 1. 今天简单地与大 ...
Intel Media SDK H264 encoder GOP setting
1 I帧,P帧,B帧,IDR帧,NAL单元 I frame:帧内编码帧,又称intra picture,I 帧通常是每个 GOP(MPEG 所使用的一种视频压缩技术)的第一个帧,经过适度地压缩,做为随 ...
java url encoder 的一个问题
@RequestMapping(value = {"/search"}) public String errorPath(HttpServletResponse response, ...
C# 字符编码解码 Encoder 和Decoder
在网络传输和文件操作中,如果数据量很大,需要将其划分为较小的快,此时可能出现一个数据块的末尾是一个不匹配的高代理项,而与其匹配的低代理项在下一个数据块. 这时候使用Encoding的GetBytes方 ...
C# 与 Microsoft Expression Encoder实现屏幕录制
在日常开发中,我们会经常遇到屏幕录制的需求.在C#中可以通过Expression Encoder的SDK实现这样的需求.首先需要下载Expression Encoder SDK,实现代码: priva ...
NGif, Animated GIF Encoder for .NET
1.简介链接: http://www.codeproject.com/Articles/11505/NGif-Animated-GIF-Encoder-for-NET 2.代码使用 1)多个Imag ...
是否允许处理由Zend Encoder加密的PHP文件
Zend Optimizer是由PHP核心引擎"Zend"创建者Zend技术公司所开的免费PHP优化软件.据Zend公司透露使用这个软件某些情况下至少可以提高性能30%以上!Zen ...
自定义Encoder/Decoder进行对象传递
转载:http://blog.csdn.net/top_code/article/details/50901623 在上一篇文章中,我们使用Netty4本身自带的ObjectDecoder,Objec ...
expression encoder 4 安装出现“已经安排重启您的计算机
问题: expression encoder 4 安装出现“已经安排重启您的计算机解决的办法,注册表数据的修改开始运行 regedit HKEY_LOCAL_MACHINE\SYSTEM\C ...

随机推荐

【Maven】使用学习
Maven使用 Maven Jar 搜索:https://mvnrepository.com/ Maven 国内镜像库 <mirror> <id>nexus-aliyun< ...
Vue大纲
Vue框架 Vue ---- vue的基本使用文本/事件/属性指令补充: js面向对象 js函数 Vue ---- 表单指令条件指令循环指令分隔符过滤器计算属性监听属性 Vue --- ...
BBS项目文件
bbs项目文件 # 创建好文件后 git init git remote add origin git@gitee.com:lddragon/bbs3.git git remote git pull ...
java基础集合简介Map（三）下
--Map接口简介今天来看一看map集合,map映射接口,用于存放键值对,<key,value>,通过key来查找value,顾名思义key不能为空,唯一且不重复,不然底层怎么查呢! 可 ...
Microsemi Libero使用技巧——FPGA全局网络的设置
前言刚开始做Microsemi FPGA+SoC开发时,会用到几个ARM专用的IP Core,功能一复杂起来,就会遇到某些信号如rst_n不能分配到指定的引脚上的情况,IO类型为CLKBUF,并不是 ...
tp5.1批量删除商品
选中要删除的商品,点击批量删除先在控制器使用sql语句查出商品信息goods 然后在html源码中使用goods变量. <table> {foreach $goods as $item} ...
OA表单制作（致远）
第一步.导入已经制作好的xnl表单文件. 第二步.对每个字段设置相关属性. 1.设置文本属性,录入类型选择文本框. 2.设置日期属性,录入类型选择日期控件. 3.设置引用类型,录入类型选择关联表单-- ...
Gradle 自定义插件
使用版本 5.6.2 插件被用来封装构建逻辑和一些通用配置.将可重复使用的构建逻辑和默认约定封装到插件里,以便于其他项目使用. 你可以使用你喜欢的语言开发插件,但是最终是要编译成字节码在 JVM 运行 ...
计算机组成原理——输入输出（I/O）系统考研题
(一) I/O系统基本概念 (二) 外部设备 1. 输入设备:键盘.鼠标2. 输出设备:显示器.打印机3. 外存储器:硬盘存储器.磁盘阵列.光盘存储器 (三) I/ ...
RedisSession （自定义）
RedisSession (自定义) 疯狂创客圈 Java 高并发[ 亿级流量聊天室实战]实战系列 [博客园总入口 ] 架构师成长+面试必备之高并发基础书籍 [Netty Zookeeper Red ...

QAnet Encoder

QAnet Encoder的更多相关文章

随机推荐

热门专题