CUDA[4] sample program: matrix-vector multiplication
Uses the Compressed Sparse Row (CSR) format to represent the matrix.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "gputimer.h"
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define WARP_SIZE 32

/* Abort with a readable message when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * CSR sparse matrix-vector product, "vector" variant: one warp per row.
 * Accumulates into the output, i.e. y[row] += sum_j data[j] * x[indices[j]].
 *
 * num_rows : number of matrix rows
 * ptr      : row offsets, num_rows + 1 entries; row r owns [ptr[r], ptr[r+1])
 * indices  : column index of every stored value
 * data     : stored values
 * x        : input vector, indexed by column
 * y        : output vector, num_rows entries; must be initialised by the
 *            caller because the kernel adds to it
 *
 * Launch layout: 1D grid and 1D blocks supplying at least num_rows warps.
 * blockDim.x must be a multiple of WARP_SIZE so that every warp is full and
 * all of its lanes map to the same row. No shared memory is used.
 * Requires CUDA 9+ (__shfl_down_sync).
 */
__global__ void
spmv_csr_vector_kernel(const int num_rows,
                       const int *ptr,
                       const int *indices,
                       const double *data,
                       const double *x,
                       double *y)
{
    const int thread_id = blockDim.x * blockIdx.x + threadIdx.x; // global thread index
    const int warp_id   = thread_id / WARP_SIZE;                 // global warp index
    const int lane      = thread_id & (WARP_SIZE - 1);           // thread index within the warp

    // one warp per row
    const int row = warp_id;

    // Per-lane partial sum: lane k handles elements k, k+32, k+64, ... of the row.
    // Lanes of warps past the last row keep 0 so that every lane can still
    // take part in the shuffles below.
    double sum = 0.0;
    if (row < num_rows)
    {
        const int row_start = ptr[row];
        const int row_end   = ptr[row + 1];
        for (int jj = row_start + lane; jj < row_end; jj += WARP_SIZE)
            sum += data[jj] * x[indices[jj]];
    }

    // Warp-level tree reduction (offsets 16, 8, 4, 2, 1). The *_sync shuffle
    // both exchanges the values and synchronises the participating lanes, so
    // no implicit warp-lockstep assumption is needed. Executed outside the
    // row guard so the full mask always matches the lanes that reach it.
    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
        sum += __shfl_down_sync(0xffffffffu, sum, offset);

    // first thread of the warp writes the result
    if (lane == 0 && row < num_rows)
        y[row] += sum;
}

/*
 * CSR sparse matrix-vector product, "scalar" variant: one thread per row.
 * Accumulates into the output, i.e. y[row] += sum_j data[j] * x[indices[j]].
 *
 * Parameters are the same as for spmv_csr_vector_kernel.
 * Launch layout: 1D grid and 1D blocks supplying at least num_rows threads.
 */
__global__ void
spmv_csr_scalar_kernel(const int num_rows,
                       const int *ptr,
                       const int *indices,
                       const double *data,
                       const double *x,
                       double *y)
{
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row < num_rows)
    {
        double dot = 0.0;
        const int row_start = ptr[row];
        const int row_end   = ptr[row + 1];
        for (int jj = row_start; jj < row_end; jj++)
            dot += data[jj] * x[indices[jj]];
        y[row] += dot;
    }
}

/*
 * Builds a small CSR matrix on the host, runs the vector kernel once,
 * prints the elapsed time reported by GpuTimer and the resulting vector.
 */
int main(int argc, char **argv)
{
    // NOTE(review): illustrative sample data. It only has to be
    // self-consistent: 9 stored values, 4 rows, 5 columns.
    //
    //     | 1 4 0 0 0 |
    // A = | 0 2 3 0 0 |     x = (1 2 3 4 5)^T     A*x = (9 13 73 57)^T
    //     | 5 0 0 7 8 |
    //     | 0 0 9 0 6 |
    double h_data[] = {1, 4, 2, 3, 5, 7, 8, 9, 6};
    int    h_col[]  = {0, 1, 1, 2, 0, 3, 4, 2, 4};
    int    h_ptr[]  = {0, 2, 4, 7, 9};
    double h_x[]    = {1, 2, 3, 4, 5};
    double h_y[]    = {0, 0, 0, 0};
    int num_rows = 4;

    double *d_data;
    int *d_col;
    int *d_ptr;
    double *d_x;
    double *d_y;

    // Byte counts come from the host arrays themselves so that the
    // allocations and copies can never disagree with the initialisers.
    CUDA_CHECK(cudaMalloc((void**) &d_data, sizeof(h_data)));
    CUDA_CHECK(cudaMalloc((void**) &d_col,  sizeof(h_col)));
    CUDA_CHECK(cudaMalloc((void**) &d_ptr,  sizeof(h_ptr)));
    CUDA_CHECK(cudaMalloc((void**) &d_x,    sizeof(h_x)));
    CUDA_CHECK(cudaMalloc((void**) &d_y,    sizeof(h_y)));

    CUDA_CHECK(cudaMemcpy((void*)d_data, (void*)h_data, sizeof(h_data), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy((void*)d_col,  (void*)h_col,  sizeof(h_col),  cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy((void*)d_ptr,  (void*)h_ptr,  sizeof(h_ptr),  cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy((void*)d_x,    (void*)h_x,    sizeof(h_x),    cudaMemcpyHostToDevice));
    // y is copied too because both kernels accumulate into it.
    CUDA_CHECK(cudaMemcpy((void*)d_y,    (void*)h_y,    sizeof(h_y),    cudaMemcpyHostToDevice));

    GpuTimer timer;
    timer.Start();
    // One block of exactly one warp per row.
    spmv_csr_vector_kernel<<<num_rows, WARP_SIZE>>>(num_rows, d_ptr, d_col, d_data, d_x, d_y);
    //spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
    CUDA_CHECK(cudaGetLastError()); // launches do not return errors directly
    timer.Stop();
    printf("Duration: %g ms\n", timer.Elapsed());

    // Blocking copy: also surfaces any error raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy((void*)h_y, (void*)d_y, sizeof(h_y), cudaMemcpyDeviceToHost));

    for (int i = 0; i < num_rows; i++)
        printf("%.5f ", h_y[i]);
    printf("\n");

    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_col));
    CUDA_CHECK(cudaFree(d_ptr));
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));

    return 0;
}
Reference:
http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf
Section 4.3
CUDA[4] sample program: matrix-vector multiplication的更多相关文章
- ACM学习历程——UVA442 Matrix Chain Multiplication(栈)
Description Matrix Chain Multiplication Matrix Chain Multiplication Suppose you have to evaluate ...
- Matrix Chain Multiplication[HDU1082]
Matrix Chain Multiplication Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 65536/32768 K (J ...
- UVA 442 二十 Matrix Chain Multiplication
Matrix Chain Multiplication Time Limit:3000MS Memory Limit:0KB 64bit IO Format:%lld & %l ...
- UVa 442 Matrix Chain Multiplication(矩阵链,模拟栈)
意甲冠军 由于矩阵乘法计算链表达的数量,需要的计算 后的电流等于行的矩阵的矩阵的列数 他们乘足够的人才 非法输出error 输入是严格合法的 即使仅仅有两个相乘也会用括号括起来 并且括号中 ...
- Matrix Chain Multiplication(表达式求值用栈操作)
题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=1082 Matrix Chain Multiplication Time Limit: 2000/100 ...
- UVA——442 Matrix Chain Multiplication
442 Matrix Chain MultiplicationSuppose you have to evaluate an expression like A*B*C*D*E where A,B,C ...
- 例题6-3 Matrix Chain Multiplication ,Uva 442
这个题思路没有任何问题,但还是做了近三个小时,其中2个多小时调试 得到的经验有以下几点: 一定学会调试,掌握输出中间量的技巧,加强gdb调试的学习 有时候代码不对,得到的结果却是对的(之后总结以下常见 ...
- UVa442 Matrix Chain Multiplication
// UVa442 Matrix Chain Multiplication // 题意:输入n个矩阵的维度和一些矩阵链乘表达式,输出乘法的次数.假定A和m*n的,B是n*p的,那么AB是m*p的,乘法 ...
- uva-442 Matrix Chain Multiplication
Suppose you have to evaluate an expression like A*B*C*D*E where A,B,C,D and E are matrices. Since ma ...
随机推荐
- SAS PROC PRINT 常用选项和语句说明
常用选项1.使用选项OBS=修改观测序号标签2.使用NOOBS选项不显示观测序号列3.使用ID语句在输出中取代观测序号列4.使用VAR选择输出的变量5.使用WHERE语句选择输出的观测6.使用数据集选 ...
- linux服务samba与ftp篇
samba复习: 1.下载samba:yum -y install samba 2.打开配置文件/etc/samba/smb.conf输入: [共享文件名] path = 目录名 (事先创建) pub ...
- 浅析负载均衡的6种算法,Ngnix的5种算法。
浅析负载均衡的6种算法,Ngnix的5种算法.浮生偷闲百家号03-21 10:06关注内容导读其实际效果越来越接近于平均分配调用量到后端的每一台服务器,也就是轮询的结果.源地址哈希的思想是根据获取客 ...
- TCP的三次握手与四次挥手理解及面试题(很全面)
序列号seq:占4个字节,用来标记数据段的顺序,TCP把连接中发送的所有数据字节都编上一个序号,第一个字节的编号由本地随机产生:给字节编上序号后,就给每一个报文段指派一个序号:序列号seq就是这个报文 ...
- redis-单线程架构
单线程模型: redis中的数据结构并不全是简单的kv,还有list.hash等复杂的结构,这些结构很可能会进行细粒度的操作,比如在很长的列表偶棉添加一个元素,在hash当中或者删除一个对象,这样的一 ...
- Python协程、异步IO
本节内容 Gevent协程 Select\Poll\Epoll异步IO与事件驱动 Python连接Mysql数据库操作 RabbitMQ队列 Redis\Memcached缓存 Paramiko SS ...
- 什么是python的全局解释锁(GIL)
GIL解决了Python中的什么问题? 为什么选取GIL作为解决方案? 对多线程Python程序的影响 为什么GIL还没有被删除? 为什么在Python 3 中GIL没有被移除? 如何处理Python ...
- python_06 函数、全局变量与局部变量、函数递归
函数 1.函数的定义: def 函数名(参数): #解释函数的功能 代码块 返回值 函数的定义主要有如下要点: def:表示函数的关键字 函数名:函数的名称,日后根据函数名调用函数 函数体:函数中进行 ...
- 阿里云ODPS <====>蚂蚁大数据
1.命令行客户端工具的安装参考文档:http://repo.aliyun.com/odpscmd/?spm=a2c4g.11186623.2.17.5c185c23zHshCq 2.创建和查看表:ht ...
- 退出vim编辑器(转)
在linux家族中,vim编辑器是系统自带的文本编辑器,其功能强大自不必说了. 偶有小白,刚接触linux,要修改某个文本文件,不可能像WINDOWS那样操作,更有甚者,进入VI编辑器后,无法退出以致 ...