CUDA[4] sample program: matrix-vector multiplication

The matrix is stored in Compressed Sparse Row (CSR) format.

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include "gputimer.h"

 #include<stdio.h>

 #include<stdlib.h>

 #include<string.h>

 #define WARP_SIZE 32

 // Sparse matrix-vector product y += A * x for a matrix A stored in CSR
 // format, using one warp per matrix row.
 //
 // Parameters:
 //   num_rows - number of rows of A.
 //   ptr      - row offsets (num_rows + 1 entries); the non-zeros of row r
 //              occupy positions [ptr[r], ptr[r + 1]) of indices/data.
 //   indices  - column index of each stored non-zero.
 //   data     - value of each stored non-zero.
 //   x        - input vector, read at the positions listed in indices.
 //   y        - output vector; the row sum is ADDED to y[row], so the caller
 //              must initialise y (e.g. to zero) before the launch.
 //
 // Launch requirements: 1D grid and 1D blocks, blockDim.x a multiple of
 // WARP_SIZE, and at least num_rows warps in total. No shared memory is used.
 // Requires CUDA 9 or newer (__shfl_down_sync on double).
 __global__ void
 spmv_csr_vector_kernel(const int num_rows,
                        const int *ptr,
                        const int *indices,
                        const double *data,
                        const double *x,
                        double *y)
 {
     const int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index
     const int warp_id = thread_id / WARP_SIZE;                   // global warp index
     const int lane = thread_id & (WARP_SIZE - 1);                // thread index within the warp

     // One warp per row: every lane of a warp computes the same row, so the
     // branch below is uniform across the warp.
     const int row = warp_id;
     if (row < num_rows)
     {
         const int row_start = ptr[row];
         const int row_end = ptr[row + 1];

         // Each lane accumulates every WARP_SIZE-th non-zero of the row.
         double sum = 0.0;
         for (int jj = row_start + lane; jj < row_end; jj += WARP_SIZE)
             sum += data[jj] * x[indices[jj]];

         // Tree reduction across the warp (offsets 16, 8, 4, 2, 1). The
         // *_sync shuffle synchronises the participating lanes itself, so the
         // reduction does not depend on implicit lock-step execution and does
         // not need a shared-memory buffer sized to the block.
         for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
             sum += __shfl_down_sync(0xffffffffu, sum, offset);

         // Lane 0 holds the complete row sum and writes the result.
         if (lane == 0)
             y[row] += sum;
     }
 }

 // Sparse matrix-vector product y += A * x for a matrix A stored in CSR
 // format, using one thread per matrix row.
 //
 // Parameters:
 //   num_rows - number of rows of A.
 //   ptr      - row offsets (num_rows + 1 entries); the non-zeros of row r
 //              occupy positions [ptr[r], ptr[r + 1]) of indices/data.
 //   indices  - column index of each stored non-zero.
 //   data     - value of each stored non-zero.
 //   x        - input vector, read at the positions listed in indices.
 //   y        - output vector; the row sum is ADDED to y[row], so the caller
 //              must initialise y (e.g. to zero) before the launch.
 //
 // Launch requirements: 1D grid and 1D blocks providing at least num_rows
 // threads in total; surplus threads return without touching memory.
 __global__ void
 spmv_csr_scalar_kernel(const int num_rows,
                        const int *ptr,
                        const int *indices,
                        const double *data,
                        const double *x,
                        double *y)
 {
     const int row = blockDim.x * blockIdx.x + threadIdx.x; // one thread per row

     if (row < num_rows)
     {
         const int row_start = ptr[row];
         const int row_end = ptr[row + 1];

         // Sequential dot product of the sparse row with x.
         double dot = 0.0;
         for (int jj = row_start; jj < row_end; jj++)
             dot += data[jj] * x[indices[jj]];

         y[row] += dot;
     }
 }

 // Aborts the program with a readable message when a CUDA runtime call fails.
 static void check_cuda(cudaError_t status, const char *what)
 {
     if (status != cudaSuccess)
     {
         fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
         exit(EXIT_FAILURE);
     }
 }

 // Builds a small CSR matrix on the host, runs the warp-per-row SpMV kernel
 // on the device, and prints the elapsed time and the resulting vector y.
 int main(int argc, char **argv)
 {
     // NOTE(review): the numeric literals of the published listing were lost;
     // only the array lengths survived (9 non-zeros, 5 row offsets, 5-entry x,
     // 4-entry y). The values below are an illustrative 4 x 5 example of that
     // shape -- replace them with the intended data if it is known.
     //
     //       | 1 0 2 0 0 |
     //   A = | 0 3 0 4 0 |     x = (1 2 3 4 5)^T
     //       | 5 0 6 0 7 |
     //       | 0 8 0 0 9 |     expected y = A * x = (7 22 58 61)^T
     double h_data[] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; // non-zero values, row by row
     int h_col[] = {0, 2, 1, 3, 0, 2, 4, 1, 4};     // column index of each non-zero
     int h_ptr[] = {0, 2, 4, 7, 9};                 // row offsets (num_rows + 1 entries)
     double h_x[] = {1, 2, 3, 4, 5};
     double h_y[] = {0, 0, 0, 0};                   // kernels accumulate into y, so start at zero
     int num_rows = 4;

     double *d_data;
     int *d_col;
     int *d_ptr;
     double *d_x;
     double *d_y;

     // Buffer sizes are taken from the host arrays so they cannot drift apart.
     check_cuda(cudaMalloc((void **)&d_data, sizeof(h_data)), "cudaMalloc(d_data)");
     check_cuda(cudaMalloc((void **)&d_col, sizeof(h_col)), "cudaMalloc(d_col)");
     check_cuda(cudaMalloc((void **)&d_ptr, sizeof(h_ptr)), "cudaMalloc(d_ptr)");
     check_cuda(cudaMalloc((void **)&d_x, sizeof(h_x)), "cudaMalloc(d_x)");
     check_cuda(cudaMalloc((void **)&d_y, sizeof(h_y)), "cudaMalloc(d_y)");

     check_cuda(cudaMemcpy((void *)d_data, (void *)h_data, sizeof(h_data), cudaMemcpyHostToDevice), "cudaMemcpy(d_data)");
     check_cuda(cudaMemcpy((void *)d_col, (void *)h_col, sizeof(h_col), cudaMemcpyHostToDevice), "cudaMemcpy(d_col)");
     check_cuda(cudaMemcpy((void *)d_ptr, (void *)h_ptr, sizeof(h_ptr), cudaMemcpyHostToDevice), "cudaMemcpy(d_ptr)");
     check_cuda(cudaMemcpy((void *)d_x, (void *)h_x, sizeof(h_x), cudaMemcpyHostToDevice), "cudaMemcpy(d_x)");
     check_cuda(cudaMemcpy((void *)d_y, (void *)h_y, sizeof(h_y), cudaMemcpyHostToDevice), "cudaMemcpy(d_y)");

     GpuTimer timer;

     timer.Start();

     // One block of WARP_SIZE threads (a single warp) per matrix row.
     spmv_csr_vector_kernel<<<num_rows, WARP_SIZE>>>(num_rows, d_ptr, d_col, d_data, d_x, d_y);

     //spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);

     timer.Stop();

     // Launch-configuration errors are only reported through cudaGetLastError.
     check_cuda(cudaGetLastError(), "kernel launch");

     printf("Duration: %g ms\n", timer.Elapsed());

     // Blocking copy: also waits for the kernel and surfaces execution errors.
     check_cuda(cudaMemcpy((void *)h_y, (void *)d_y, sizeof(h_y), cudaMemcpyDeviceToHost), "cudaMemcpy(h_y)");

     for (int i = 0; i < num_rows; i++)
         printf("%.5f ", h_y[i]);

     printf("\n");

     cudaFree(d_data);
     cudaFree(d_col);
     cudaFree(d_ptr);
     cudaFree(d_x);
     cudaFree(d_y);

     return 0;
 }

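Reference:

N. Bell and M. Garland, "Efficient Sparse Matrix-Vector Multiplication on CUDA", NVIDIA Technical Report NVR-2008-004, Section 4.3:

http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf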

CUDA[4] sample program: matrix-vector multiplication的更多相关文章

ACM学习历程——UVA442 Matrix Chain Multiplication（栈）
Description Matrix Chain Multiplication Matrix Chain Multiplication Suppose you have to evaluate ...
Matrix Chain Multiplication[HDU1082]
Matrix Chain Multiplication Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 65536/32768 K (J ...
UVA 442 二十 Matrix Chain Multiplication
Matrix Chain Multiplication Time Limit:3000MS Memory Limit:0KB 64bit IO Format:%lld & %l ...
UVa 442 Matrix Chain Multiplication(矩阵链,模拟栈)
意甲冠军由于矩阵乘法计算链表达的数量,需要的计算后的电流等于行的矩阵的矩阵的列数他们乘足够的人才非法输出error 输入是严格合法的即使仅仅有两个相乘也会用括号括起来并且括号中 ...
Matrix Chain Multiplication(表达式求值用栈操作）
题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=1082 Matrix Chain Multiplication Time Limit: 2000/100 ...
UVA——442 Matrix Chain Multiplication
442 Matrix Chain MultiplicationSuppose you have to evaluate an expression like A*B*C*D*E where A,B,C ...
例题6-3 Matrix Chain Multiplication ，Uva 442
这个题思路没有任何问题,但还是做了近三个小时,其中2个多小时调试得到的经验有以下几点: 一定学会调试,掌握输出中间量的技巧,加强gdb调试的学习有时候代码不对,得到的结果却是对的(之后总结以下常见 ...
UVa442 Matrix Chain Multiplication
// UVa442 Matrix Chain Multiplication // 题意:输入n个矩阵的维度和一些矩阵链乘表达式,输出乘法的次数.假定A和m*n的,B是n*p的,那么AB是m*p的,乘法 ...
uva-442 Matrix Chain Multiplication
Suppose you have to evaluate an expression like A*B*C*D*E where A,B,C,D and E are matrices. Since ma ...

随机推荐

ssl协议相关
<1> SSL版本测试浏览器支持的SSL版本的网站: https://www.ssllabs.com/ssltest/viewMyClient.html 0xfefd (DTLS ...
HTTPS如何保证数据传输的安全性 -- 结合加密
什么是HTTPS: HTTP就是我们平时浏览网页时候使用的一种协议 HTTP协议传输的数据都是未加密的,也就是明文的,因此使用HTTP协议传输隐私信息非常不安全.为了保证这些隐私数据能加密传输,于是网 ...
学习 MeteoInfo二次开发教程（三）
1.breakList的问题 ((PolygonBreak) aLS.breakList[0]).DrawFill=false; 新的类库将LegendScheme的breakList属性改为了Leg ...
H5的本地存储技术及其与Cookie的比较
第一部分: H5的本地存储技术 HTML5 提供了两种在客户端存储数据的新方法.先看下面的例子: 例1:var mySelection = {name:"car", amount: ...
Vue Checkbox全选和选中的方法
<div class="search-content"> <Checkbox :value="checkAll" @click.prevent ...
python-单元测试优化，加入日志
HttpRequests.py #-*- coding:utf-8 -*- import requests class HttpRequests(): def http_requests(self,u ...
说下spring生命周期
面试官:说下spring生命周期程序员:不会那你先回去等消息吧 Bean实现了BeanNameAware,Spring会将Bean的ID透传给setBeanName java.后端开发.程 ...
NetBeans 代码折叠
代码折叠 // <editor-fold> Your code goes here...// </editor-fold> 添加描述 // <editor-fold ...
sybase-sql语法-replace用法
1.去空格 update hyl_temp02 --去空格 set acc_nbr=replace(acc_nbr,' ',''); commit; 2.去回车 update hyl_temp02 - ...
网络抓包工具 wireshark 入门教程
Wireshark Wireshark(前称Ethereal)是一个网络数据包分析软件.网络数据包分析软件的功能是截取网络数据包,并尽可能显示出最为详细的网络数据包数据.Wireshark使用WinP ...

CUDA[4] sample program: matrix-vector multiplication

CUDA[4] sample program: matrix-vector multiplication的更多相关文章

随机推荐

热门专题