CUDA[4] sample program: matrix-vector multiplication

Uses the Compressed Sparse Row (CSR) format to represent the matrix.

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #include "gputimer.h"

 #include<stdio.h>

 #include<stdlib.h>

 #include<string.h>

 #define WARP_SIZE 32

 // CSR sparse matrix-vector product, y += A * x, using one warp per matrix row.
 //
 // Parameters:
 //   num_rows - number of rows of A (and number of entries of y that are updated)
 //   ptr      - CSR row-pointer array; ptr[row] .. ptr[row + 1] delimits the
 //              non-zeros of `row`, so num_rows + 1 entries are read
 //   indices  - column index of each non-zero
 //   data     - value of each non-zero
 //   x        - input vector, indexed by the column indices
 //   y        - output vector; each row's dot product is ADDED to y[row]
 //
 // Launch requirements:
 //   * 1D grid and block, with blockDim.x a multiple of WARP_SIZE so that every
 //     warp is complete and maps to exactly one row.
 //   * At least num_rows warps in total (e.g. <<<num_rows, WARP_SIZE>>>).
 //   * Uses __shfl_down_sync, i.e. requires CUDA 9 or newer. No shared memory.
 __global__ void
 spmv_csr_vector_kernel ( const int num_rows ,
                          const int * ptr ,
                          const int * indices ,
                          const double * data ,
                          const double * x ,
                          double * y)
 {
     const int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index
     const int row       = thread_id / WARP_SIZE;                 // global warp index == row
     const int lane      = thread_id & (WARP_SIZE - 1);           // thread index within the warp

     // All lanes of a warp share the same `row`, so this branch is warp-uniform
     // and the full-mask shuffles below are executed by all 32 lanes.
     if (row < num_rows)
     {
         const int row_start = ptr[row];
         const int row_end   = ptr[row + 1];

         // Each lane accumulates every WARP_SIZE-th non-zero of the row.
         double sum = 0.0;
         for (int jj = row_start + lane; jj < row_end; jj += WARP_SIZE)
             sum += data[jj] * x[indices[jj]];

         // Warp-level tree reduction (offsets 16, 8, 4, 2, 1). The shuffle is its
         // own synchronisation point, so no implicit warp-lockstep assumption is
         // needed; lane 0 ends up holding the sum over all lanes.
         for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
             sum += __shfl_down_sync(0xffffffffu, sum, offset);

         // First lane of the warp writes the result.
         if (lane == 0)
             y[row] += sum;
     }
 }

 // CSR sparse matrix-vector product, y += A * x, using one thread per matrix row.
 //
 // Parameters:
 //   num_rows - number of rows of A (and number of entries of y that are updated)
 //   ptr      - CSR row-pointer array; ptr[row] .. ptr[row + 1] delimits the
 //              non-zeros of `row`, so num_rows + 1 entries are read
 //   indices  - column index of each non-zero
 //   data     - value of each non-zero
 //   x        - input vector, indexed by the column indices
 //   y        - output vector; each row's dot product is ADDED to y[row]
 //
 // Launch requirements: 1D grid and block with at least num_rows threads in
 // total; surplus threads fall through the bounds check and do nothing.
 __global__ void
 spmv_csr_scalar_kernel ( const int num_rows ,
                          const int * ptr ,
                          const int * indices ,
                          const double * data ,
                          const double * x ,
                          double * y)
 {
     const int row = blockDim.x * blockIdx.x + threadIdx.x; // one thread per row

     if (row < num_rows)
     {
         const int row_start = ptr[row];
         const int row_end   = ptr[row + 1];

         // Sequential dot product of this row's non-zeros with x.
         double dot = 0.0;
         for (int jj = row_start; jj < row_end; jj++)
             dot += data[jj] * x[indices[jj]];

         y[row] += dot;
     }
 }

 // Abort with a readable message if a CUDA runtime call did not succeed.
 static void check_cuda(cudaError_t status, const char *what)
 {
     if (status != cudaSuccess)
     {
         fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
         exit(EXIT_FAILURE);
     }
 }

 // Demo driver: builds a small CSR matrix on the host, copies it to the device,
 // runs the warp-per-row SpMV kernel once (timed with GpuTimer) and prints y.
 int main(int argc,char **argv)
 {
     (void)argc;
     (void)argv;

     // NOTE(review): the numeric literals of the published listing were lost.
     // Only the element counts survive (9 non-zeros, 5 row pointers, 5-element x,
     // 4-element y). The values below are an illustrative 4x5 matrix with those
     // counts, not the original data:
     //
     //     [ 1 0 2 0 0 ]
     // A = [ 0 3 0 4 5 ]      x = [1 2 3 4 5]^T      expected y = [7 47 41 60]^T
     //     [ 6 0 0 0 7 ]
     //     [ 0 0 8 9 0 ]
     double h_data[]={1,2,3,4,5,6,7,8,9};
     int h_col[]={0,2,1,3,4,0,4,2,3};
     int h_ptr[]={0,2,5,7,9};
     double h_x[]={1,2,3,4,5};
     double h_y[]={0,0,0,0};   // the kernels accumulate into y, so start from zero
     int num_rows=4;

     double *d_data;
     int *d_col;
     int *d_ptr;
     double *d_x;
     double *d_y;

     // Byte counts come from the host arrays themselves so they cannot drift
     // out of sync with the initialisers above.
     check_cuda(cudaMalloc((void**) &d_data,sizeof(h_data)), "cudaMalloc d_data");
     check_cuda(cudaMalloc((void**) &d_col,sizeof(h_col)), "cudaMalloc d_col");
     check_cuda(cudaMalloc((void**) &d_ptr,sizeof(h_ptr)), "cudaMalloc d_ptr");
     check_cuda(cudaMalloc((void**) &d_x,sizeof(h_x)), "cudaMalloc d_x");
     check_cuda(cudaMalloc((void**) &d_y,sizeof(h_y)), "cudaMalloc d_y");

     check_cuda(cudaMemcpy((void*)d_data, (void*)h_data, sizeof(h_data), cudaMemcpyHostToDevice), "copy data");
     check_cuda(cudaMemcpy((void*)d_col, (void*)h_col, sizeof(h_col), cudaMemcpyHostToDevice), "copy col");
     check_cuda(cudaMemcpy((void*)d_ptr, (void*)h_ptr, sizeof(h_ptr), cudaMemcpyHostToDevice), "copy ptr");
     check_cuda(cudaMemcpy((void*)d_x, (void*)h_x, sizeof(h_x), cudaMemcpyHostToDevice), "copy x");
     check_cuda(cudaMemcpy((void*)d_y, (void*)h_y, sizeof(h_y), cudaMemcpyHostToDevice), "copy y");

     GpuTimer timer;
     timer.Start();

     // One block of exactly one warp per row, as the vector kernel expects.
     spmv_csr_vector_kernel<<<num_rows,WARP_SIZE>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
     //spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);

     timer.Stop();

     // Launch-configuration errors are only reported through cudaGetLastError.
     check_cuda(cudaGetLastError(), "kernel launch");

     printf("Duration: %g ms\n",timer.Elapsed());

     // Blocking copy: also surfaces any error raised while the kernel was running.
     check_cuda(cudaMemcpy((void*)h_y, (void*)d_y, sizeof(h_y), cudaMemcpyDeviceToHost), "copy y back");

     for(int i=0;i<num_rows;i++)
         printf("%.5f ",h_y[i]);
     printf("\n");

     cudaFree(d_data);
     cudaFree(d_col);
     cudaFree(d_ptr);
     cudaFree(d_x);
     cudaFree(d_y);

     return 0;
 }

Reference:

http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf

Section 4.3

CUDA[4] sample program: matrix-vector multiplication的更多相关文章

ACM学习历程——UVA442 Matrix Chain Multiplication（栈）
Description Matrix Chain Multiplication Matrix Chain Multiplication Suppose you have to evaluate ...
Matrix Chain Multiplication[HDU1082]
Matrix Chain Multiplication Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 65536/32768 K (J ...
UVA 442 二十 Matrix Chain Multiplication
Matrix Chain Multiplication Time Limit:3000MS Memory Limit:0KB 64bit IO Format:%lld & %l ...
UVa 442 Matrix Chain Multiplication(矩阵链,模拟栈)
意甲冠军由于矩阵乘法计算链表达的数量,需要的计算后的电流等于行的矩阵的矩阵的列数他们乘足够的人才非法输出error 输入是严格合法的即使仅仅有两个相乘也会用括号括起来并且括号中 ...
Matrix Chain Multiplication(表达式求值用栈操作）
题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=1082 Matrix Chain Multiplication Time Limit: 2000/100 ...
UVA——442 Matrix Chain Multiplication
442 Matrix Chain MultiplicationSuppose you have to evaluate an expression like A*B*C*D*E where A,B,C ...
例题6-3 Matrix Chain Multiplication ，Uva 442
这个题思路没有任何问题,但还是做了近三个小时,其中2个多小时调试得到的经验有以下几点: 一定学会调试,掌握输出中间量的技巧,加强gdb调试的学习有时候代码不对,得到的结果却是对的(之后总结以下常见 ...
UVa442 Matrix Chain Multiplication
// UVa442 Matrix Chain Multiplication // 题意:输入n个矩阵的维度和一些矩阵链乘表达式,输出乘法的次数.假定A和m*n的,B是n*p的,那么AB是m*p的,乘法 ...
uva-442 Matrix Chain Multiplication
Suppose you have to evaluate an expression like A*B*C*D*E where A,B,C,D and E are matrices. Since ma ...

随机推荐

关于CoreData的用法
有些同事觉得CoreData是一个看不懂,理解不清的神秘东东,其实ios的本地数据储存是一个sqlite数据库,一个简易的数据库,而这个CoreData是否支持所有储存的数据呢,显然不是的,站在我的角 ...
pyton unittest
在说unittest之前,先说几个概念: TestCase 也就是测试用例 TestSuite 多个测试用例集合在一起,就是TestSuite TestLoader是用来加载TestCase到Test ...
Idea的pom文件导入依赖包仍然报错
(4)网络配置及CRT远程连接
修改linux虚拟机中某一网卡的网络配置: 打开终端,输入命令vi /etc/sysconfig/network-scripts/ifcfg-eth0 在文件中写入以下内容: (这里有个错误,DNS要 ...
java判断是否是数字
1.用JAVA自带的函数 public static boolean isNumeric(String str){ for (int i = 0; i < str.length(); i++){ ...
leetcode32
class Solution { public: int longestValidParentheses(string s) { ; stack<int> st; ; i < n; ...
int和Integer的自动拆箱/装箱相关问题
java中为没一种基本类型都提供相应的包装类型. byte,short,char,int,long,float,double和boolean Byte,Short,Character,Integer, ...
预浸料（Prepreg，PreimpregnatedMaterials）
预浸料(Prepreg,PreimpregnatedMaterials),是把基体(Matrix)浸渍在强化纤维(Reinforced Fiber)中制成的预浸片材产品,是复合材料的中间材料.
com.mysql.jdbc.Connection.isValid(I)Z
spring boot项目运行提示这个错误的时候只需要配置
我尼玛，二半夜的说中photo.src病毒了。
大半夜手机预警,中病毒了,我感觉也没啥东西呀.一个破小网站,别人黑我干啥. 登上服务器去一看,我滴个乖乖,photo.src病毒.服务器里面显示是一个背景桌面应用程序, 打算直接从文件夹删除,但是正在 ...

CUDA[4] sample program: matrix-vector multiplication

CUDA[4] sample program: matrix-vector multiplication的更多相关文章

随机推荐

热门专题