cublas fp16

编译命令：nvcc 4.cpp -o test_gemm -lcudart -lcuda -lcublas -std=c++11

#include <stdint.h>
#include <stdio.h>
#include <sys/time.h>

#include <type_traits>

#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>

// Quantizes a float to a signed 8-bit integer.
//
// The input is multiplied by `scale`, clamped to the symmetric range
// [-127, 127], and then truncated toward zero.
//
// f:     value to quantize.
// scale: multiplier applied before the conversion.
// Returns the clamped, truncated product; a NaN product maps to 0.
int8_t float2int8(float f, float scale) {
    float scaled = f * scale;

    // NaN compares unequal to itself; converting it to an integer is undefined.
    if (scaled != scaled) return 0;

    // Clamp while the value is still a float. Converting an out-of-range float
    // to int8_t is undefined behaviour, and once the value has been narrowed
    // a comparison against 127 can never be true.
    if (scaled < -127.0f) scaled = -127.0f;
    if (scaled > 127.0f) scaled = 127.0f;

    return static_cast<int8_t>(scaled);
}

template <typename T, typename S>

void allocate_memory(int m, int n, int k, T **A, T **B, S **C) {

    cudaMallocManaged(A, m * k * sizeof(T));

    cudaMallocManaged(B, k * n * sizeof(T));

    cudaMallocManaged(C, m * n * sizeof(S));

}

template <typename T, typename S>

void free_memory(T *A, T *B, S *C) {

    cudaFree(A);

    cudaFree(B);

    cudaFree(C);

}

// Thin wrapper around cublasGemmEx that derives the cuBLAS data-type enums
// from the template parameters.
//
// Supported combinations:
//   T = float,  S = float   -> A/B/C and compute type CUDA_R_32F
//   T = __half, S = __half  -> A/B/C and compute type CUDA_R_16F
//   T = int8_t, S = int32_t -> A/B CUDA_R_8I, C and compute type CUDA_R_32I
//
// handle:          cuBLAS handle forwarded to cublasGemmEx.
// transA, transB:  operation applied to A and B.
// m, n, k:         GEMM dimensions, forwarded unchanged.
// A, B, C:         operand and result buffers.
// lda, ldb, ldc:   leading dimensions of A, B and C.
// alpha, beta:     scaling factors, passed by pointer to cublasGemmEx.
// algo:            value cast to cublasGemmAlgo_t.
//
// Returns 1 when cublasGemmEx reports CUBLAS_STATUS_SUCCESS and -1 otherwise,
// including when the T/S combination is not one of those listed above.
//
// NOTE(review): the compute type is passed as cudaDataType_t; newer cuBLAS
// releases declare this parameter as cublasComputeType_t - confirm against
// the toolkit version in use.
template <typename T, typename S>
int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transA, cublasOperation_t transB,
                   int m, int n, int k, T *A, T *B, S *C, int lda, int ldb, int ldc,
                   S *alpha, S *beta, int algo) {
    cudaDataType_t AType, BType, CType, ComputeType;
    bool output_type_matches = false;

    if (std::is_same<T, float>::value) {
        AType = BType = CType = ComputeType = CUDA_R_32F;
        output_type_matches = std::is_same<S, float>::value;
    } else if (std::is_same<T, __half>::value) {
        AType = BType = CType = ComputeType = CUDA_R_16F;
        output_type_matches = std::is_same<S, __half>::value;
    } else if (std::is_same<T, int8_t>::value) {
        AType = BType = CUDA_R_8I;
        CType = ComputeType = CUDA_R_32I;
        output_type_matches = std::is_same<S, int32_t>::value;
    } else {
        printf("Not supported data type.\n");
        return -1;
    }

    // C, alpha and beta are typed S but described to cuBLAS as CType /
    // ComputeType; a mismatch would make cuBLAS misread those buffers.
    if (!output_type_matches) {
        printf("Output/scalar type does not match the input data type.\n");
        return -1;
    }

    cublasStatus_t status = cublasGemmEx(handle,
                                         transA, transB,
                                         m, n, k,
                                         alpha,
                                         A, AType, lda,
                                         B, BType, ldb,
                                         beta,
                                         C, CType, ldc,
                                         ComputeType,
                                         static_cast<cublasGemmAlgo_t>(algo));

    // Failures are deliberately not logged here: the return code alone tells
    // the caller whether the call succeeded.
    return (status == CUBLAS_STATUS_SUCCESS) ? 1 : -1;
}

// Times `iteration` calls of cublas_gemm_ex for one algorithm and prints the
// average wall-clock time per call in milliseconds.
//
// handle:       cuBLAS handle forwarded to cublas_gemm_ex.
// m, n, k:      shape of the product, A is m x k and B is k x n.
// A, B, C:      operand and result buffers.
// alpha, beta:  scaling factors, passed by pointer.
// algo:         algorithm id forwarded to cublas_gemm_ex.
// iteration:    number of calls; call 0 is treated as warm-up and not timed.
//
// Nothing is printed when no timed call succeeded.
template <typename T, typename S>
void test_gemm(cublasHandle_t handle, int m, int n, int k, T *A, T *B, S *C,
               S *alpha, S *beta, int algo, int iteration) {
    float total_time = 0;
    int timed_runs = 0;  // successful calls that contributed to total_time

    for (int i = 0; i < iteration; ++i) {
        struct timeval start, end;

        // Drain pending GPU work so the interval covers only this call.
        cudaDeviceSynchronize();
        cudaProfilerStart();
        gettimeofday(&start, NULL);

        // cuBLAS is column-major. Passing the operands swapped (B, A) with
        // dimensions (n, m, k) and leading dimensions (n, k, n) yields the
        // row-major product C = A * B without an explicit transpose.
        int success = cublas_gemm_ex(handle,
                                     CUBLAS_OP_N, CUBLAS_OP_N,
                                     n, m, k,
                                     B, A, C,
                                     n, k, n,
                                     alpha, beta,
                                     algo);

        // Wait for the GEMM to finish before reading the clock.
        cudaDeviceSynchronize();
        gettimeofday(&end, NULL);
        cudaProfilerStop();

        if (success > 0 && i > 0) {
            total_time += (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
            ++timed_runs;
        }
    }

    // Average over the calls that were actually timed, so that a failed call
    // does not deflate the reported figure.
    if (timed_runs > 0 && total_time > 0)
        printf("algo %d: %.3f ms\n", algo, total_time / timed_runs);
}

int main() {

    int m = 4096, n = 8192, k = 1024;

    printf("shape: (%d, %d) x (%d, %d)\n", m, k, k, n);

    int start_algo = CUBLAS_GEMM_DEFAULT;

    int end_algo = CUBLAS_GEMM_ALGO23;

    int start_algo_t_op = CUBLAS_GEMM_DEFAULT_TENSOR_OP;

    int end_algo_t_op = CUBLAS_GEMM_ALGO15_TENSOR_OP;

    int iteration = 10;

    float *fA, *fB, *fC;

    __half *hA, *hB, *hC;

    int8_t *iA, *iB; int32_t *iC;

    float f_alpha = 1, f_beta = 0;

    __half h_alpha = __float2half_rn(1.0), h_beta = __float2half_rn(0.0);

    int32_t i_alpha = 1, i_beta = 0;

    allocate_memory(m, n, k, &fA, &fB, &fC);

    allocate_memory(m, n, k, &hA, &hB, &hC);

    allocate_memory(m, n, k, &iA, &iB, &iC);

    for (int i = 0; i < m * k; ++i) {

        fA[i] = float(i % 255 - 127) / 127;

        hA[i] = __float2half_rn(fA[i]);

        iA[i] = float2int8(fA[i], 127);

    }

    for (int i = 0; i < k * n; ++i) {

        fB[i] = float(i % 255 - 127) / 127;

        hB[i] = __float2half_rn(fB[i]);

        iB[i] = float2int8(fB[i], 127);

    }

    cublasHandle_t handle;

    cublasCreate(&handle);

    printf(">>>>>>>>>>>>>>>>> test fp32 >>>>>>>>>>>>>>>>>\n");

    for (int algo = start_algo; algo <= end_algo; ++algo)

        test_gemm(handle, m, n, k, fA, fB, fC, &f_alpha, &f_beta, algo, iteration);

    for (int algo = start_algo_t_op; algo <= end_algo_t_op; ++algo)

        test_gemm(handle, m, n, k, fA, fB, fC, &f_alpha, &f_beta, algo, iteration);

    printf(">>>>>>>>>>>>>>>>> test fp16 >>>>>>>>>>>>>>>>>\n");

    for (int algo = start_algo; algo <= end_algo; ++algo)

        test_gemm(handle, m, n, k, hA, hB, hC, &h_alpha, &h_beta, algo, iteration);

    for (int algo = start_algo_t_op; algo <= end_algo_t_op; ++algo)

        test_gemm(handle, m, n, k, hA, hB, hC, &h_alpha, &h_beta, algo, iteration);

    printf(">>>>>>>>>>>>>>>>> test int8 >>>>>>>>>>>>>>>>>\n");

    for (int algo = start_algo; algo <= end_algo; ++algo)

        test_gemm(handle, m, n, k, iA, iB, iC, &i_alpha, &i_beta, algo, iteration);

    for (int algo = start_algo_t_op; algo <= end_algo_t_op; ++algo)

        test_gemm(handle, m, n, k, iA, iB, iC, &i_alpha, &i_beta, algo, iteration);

    printf(">>>>>>>>>>>>>>>>> compare result >>>>>>>>>>>>>>>>>\n");

    printf("fp32: ");

    for (int i = 0; i < 10; ++i)

        printf("%.5f%c", fC[i], " \n"[i==9]);

    printf("fp16: ");

    for (int i = 0; i < 10; ++i)

        printf("%.5f%c", float(hC[i]), " \n"[i==9]);

    printf("int8: ");

    for (int i = 0; i < 10; ++i)

        printf("%.5f%c", float(iC[i])/127/127, " \n"[i==9]);

    free_memory(iA, iB, iC);

    free_memory(fA, fB, fC);

    free_memory(hA, hB, hC);

    return 0;

}

cublas fp16的更多相关文章

在NVIDIA（CUDA，CUBLAS）和Intel MKL上快速实现BERT推理
在NVIDIA(CUDA,CUBLAS)和Intel MKL上快速实现BERT推理直接在NVIDIA(CUDA,CUBLAS)或Intel MKL上进行高度定制和优化的BERT推理,而无需tenso ...
cublas相关的知识
下面链接给出了一个例子,怎么用cublas进行矩阵的运算提速,也说明了cublas的大致的使用方法. http://www.cnblogs.com/scut-fm/p/3756242.html cub ...
使用 CUBLAS 库给矩阵运算提速
前言编写 CUDA 程序真心不是个简单的事儿,调试也不方便,很费时.那么有没有一些现成的 CUDA 库来调用呢? 答案是有的,如 CUBLAS 就是 CUDA 专门用来解决线性代数运算的库. 本文将 ...
有关CUBLAS中的矩阵乘法函数
关于cuBLAS库中矩阵乘法相关的函数及其输入输出进行详细讨论. ▶ 涨姿势: ● cuBLAS中能用于运算矩阵乘法的函数有4个,分别是 cublasSgemm(单精度实数).cublasDgemm( ...
Cublas矩阵加速运算
前言编写 CUDA 程序真心不是个简单的事儿,调试也不方便,很费时.那么有没有一些现成的 CUDA 库来调用呢? 答案是有的,如 CUBLAS 就是 CUDA 专门用来解决线性代数运算的库. 本文将 ...
CUDA报错： Cannot create Cublas handle. Cublas won't be available. 以及：Check failed: status == CUBLAS_STATUS_SUCCESS (1 vs. 0) CUBLAS_STATUS_NOT_INITIALIZED
Error描述: aita@aita-Alienware-Area-51-R5:~/AITA2/daisida/ssd-github/caffe$ make runtest -j8 .build_re ...
第四篇：使用 CUBLAS 库给矩阵运算提速
前言编写 CUDA 程序真心不是个简单的事儿,调试也不方便,很费时.那么有没有一些现成的 CUDA 库来调用呢? 答案是有的,如 CUBLAS 就是 CUDA 专门用来解决线性代数运算的库. 本文将 ...
CUBLAS基础实验
一.概述最近在试图进行cuda并行编程,目标是编写一段矩阵计算代码,将计算结果存储进入GPU的缓冲区当中,并在达到某些要求后强制刷新缓冲区,取得计算结果. 但是考虑时间紧任务重的状况和实际的性能要求 ...
基于深度学习的人脸识别系统系列（Caffe+OpenCV+Dlib）——【四】使用CUBLAS加速计算人脸向量的余弦距离
前言基于深度学习的人脸识别系统,一共用到了5个开源库:OpenCV(计算机视觉库).Caffe(深度学习库).Dlib(机器学习库).libfacedetection(人脸检测库).cudnn(gp ...
使用TensorRT对caffe和pytorch onnx版本的mnist模型进行fp32和fp16 推理 | tensorrt fp32 fp16 tutorial with caffe pytorch minist model
本文首发于个人博客https://kezunlin.me/post/bcdfb73c/,欢迎阅读最新内容! tensorrt fp32 fp16 tutorial with caffe pytorch ...

随机推荐

把逗号分隔的String字符串转List<Integer>
把逗号分隔的String字符串转int集合集合或数组转变为逗号分隔的字符串的几种方式 1.自己编码实现 2.org.apache.commons.lang3.StringUtils 3.String ...
scrapy 爬取酷狗热门歌手音乐
目录声明 Hello,酷狗! 创建一个Scrapy项目 spider模块分析前端界面注意 items模块 pipeline模块处理音频文件自定义下载路径自定义下载图片路径异步存入到数据库 ...
NFS服务器搭建与autofs自动挂载
一．NFS文件详解 1. /data/ 表示需要共享的目录. 2. IP 表示允许哪个客户端访问. 3. IP 后括号 ...
基于AD9361的双收双发射频FMC子卡
FMC177-基于AD9361的双收双发射频FMC子卡一.板卡介绍 FMC177射频模块分别包含两个接收通道与发射通道,其频率可覆盖达到70MHz~6GHz,AD9361芯片提供具有成本效益的实验平 ...
【面试题】面试突击71：GET 和 POST 有什么区别？
携手创作,共同成长!这是我参与「掘金日新计划 · 8 月更文挑战」的第4天,点击查看活动详情 GET 和 POST 是 HTTP 请求中最常用的两种请求方法,在日常开发的 RESTful 接口中,都能 ...
Hadoop搭建超级详解
html, body, div, span, applet, object, iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, a, abbr, ...
vim 小记录
将str1批量替换成str2 , 特殊符号前用转译符 \ :%s/str1/str2/g
068_Apex&Page中的ReadOnly 使用方式
一.page页面遇到需要检索大量数据的处理方式需要时会用Readonly 通常,对单个Visualforce页面请求的查询可能无法检索超过50,000行. 在Read only模式下,此限制将放宽允许 ...
Java中类似c语言的printf
System.out.printf("%4d",x); printf("%4d",x); 保留小数点后两位也可以用%.2f 相对来说很好记了回车用\n
js原生判断数字类型
js判断数字类型汇总最近在写代码的时候,有些逻辑需要判断数字类型,等用到的时候才发现自己了解的方法不太严密,然后就决心查资料汇总了解下有哪些方法比较严密第一种:typeof + isNaN使用typ ...

cublas fp16

cublas fp16的更多相关文章

随机推荐

热门专题