CUDA[4] sample program: matrix-vector multiplication
Use Compressed Sparse Row Format (CSR) to represent matrix
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "gputimer.h"
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define WARP_SIZE 32 __global__ void
spmv_csr_vector_kernel ( const int num_rows ,
const int * ptr ,
const int * indices ,
const double * data ,
const double * x ,
double * y)
{
__shared__ double vals [WARP_SIZE];
int thread_id = blockDim.x * blockIdx.x + threadIdx.x ; // global thread index
int warp_id = thread_id / WARP_SIZE; // global warp index
int lane = thread_id & (WARP_SIZE - ); // thread index within the warp
// one warp per row
int row = warp_id ;
if ( row < num_rows )
{
int row_start = ptr [ row ];
int row_end = ptr [ row +];
// compute running sum per thread
vals [ threadIdx.x ] = ;
for ( int jj = row_start + lane ; jj < row_end ; jj += WARP_SIZE)
vals [ threadIdx.x ] += data [ jj ] * x [ indices [ jj ]];
// parallel reduction in shared memory
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
// first thread writes the result
if ( lane == )
y[ row ] += vals [ threadIdx.x ];
}
} __global__ void
spmv_csr_scalar_kernel ( const int num_rows ,
const int * ptr ,
const int * indices ,
const double * data ,
const double * x ,
double * y)
{
int row = blockDim.x * blockIdx.x + threadIdx.x ;
if( row < num_rows )
{
double dot = ;
int row_start = ptr [ row ];
int row_end = ptr [ row +];
for (int jj = row_start ; jj < row_end ; jj ++)
dot += data [ jj ] * x[ indices [ jj ]];
y[ row ] += dot ;
}
} int main(int argc,char **argv)
{
double h_data[]={,,,,,,,,};
int h_col[]={,,,,,,,,};
int h_ptr[]={,,,,};
double h_x[]={,,,,};
double h_y[]={,,,};
int num_rows=; double *d_data;
int *d_col;
int *d_ptr;
double *d_x;
double *d_y; cudaMalloc((void**) &d_data,sizeof(double)*);
cudaMalloc((void**) &d_col,sizeof(int)*);
cudaMalloc((void**) &d_ptr,sizeof(int)*);
cudaMalloc((void**) &d_x,sizeof(double)*);
cudaMalloc((void**) &d_y,sizeof(double)*);
cudaMemcpy((void*)d_data, (void*)h_data, sizeof(double)*, cudaMemcpyHostToDevice);
cudaMemcpy((void*)d_col, (void*)h_col, sizeof(int)*, cudaMemcpyHostToDevice);
cudaMemcpy((void*)d_ptr, (void*)h_ptr, sizeof(int)*, cudaMemcpyHostToDevice);
cudaMemcpy((void*)d_x, (void*)h_x, sizeof(double)*, cudaMemcpyHostToDevice);
cudaMemcpy((void*)d_y, (void*)h_y, sizeof(double)*, cudaMemcpyHostToDevice); GpuTimer timer;
timer.Start();
spmv_csr_vector_kernel<<<num_rows,>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
//spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
timer.Stop();
printf("Duration: %g ms\n",timer.Elapsed()); cudaMemcpy((void*)h_y, (void*)d_y, sizeof(double)*, cudaMemcpyDeviceToHost); for(int i=;i<num_rows;i++)
printf("%.5f ",h_y[i]);
printf("\n"); return ;
}
ref:
http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf
ch4.3
CUDA[4] sample program: matrix-vector multiplication的更多相关文章
- ACM学习历程——UVA442 Matrix Chain Multiplication(栈)
Description Matrix Chain Multiplication Matrix Chain Multiplication Suppose you have to evaluate ...
- Matrix Chain Multiplication[HDU1082]
Matrix Chain Multiplication Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 65536/32768 K (J ...
- UVA 442 二十 Matrix Chain Multiplication
Matrix Chain Multiplication Time Limit:3000MS Memory Limit:0KB 64bit IO Format:%lld & %l ...
- UVa 442 Matrix Chain Multiplication(矩阵链,模拟栈)
意甲冠军 由于矩阵乘法计算链表达的数量,需要的计算 后的电流等于行的矩阵的矩阵的列数 他们乘足够的人才 非法输出error 输入是严格合法的 即使仅仅有两个相乘也会用括号括起来 并且括号中 ...
- Matrix Chain Multiplication(表达式求值用栈操作)
题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=1082 Matrix Chain Multiplication Time Limit: 2000/100 ...
- UVA——442 Matrix Chain Multiplication
442 Matrix Chain MultiplicationSuppose you have to evaluate an expression like A*B*C*D*E where A,B,C ...
- 例题6-3 Matrix Chain Multiplication ,Uva 442
这个题思路没有任何问题,但还是做了近三个小时,其中2个多小时调试 得到的经验有以下几点: 一定学会调试,掌握输出中间量的技巧,加强gdb调试的学习 有时候代码不对,得到的结果却是对的(之后总结以下常见 ...
- UVa442 Matrix Chain Multiplication
// UVa442 Matrix Chain Multiplication // 题意:输入n个矩阵的维度和一些矩阵链乘表达式,输出乘法的次数.假定A和m*n的,B是n*p的,那么AB是m*p的,乘法 ...
- uva-442 Matrix Chain Multiplication
Suppose you have to evaluate an expression like A*B*C*D*E where A,B,C,D and E are matrices. Since ma ...
随机推荐
- C语言数据结构基础学习笔记——C语言基础
抽象数据类型(ADT)是指一个数学模型以及定义在该模型上的一组操作,通常用(数据对象,数据关系,基本操作集)这样的三元组来表示抽象数据类型. 数据结构是相互之间存在一种或多种特定关系的数据元素的集合, ...
- 用PS做圆角图片
ps: Adobe Photoshop CS2 如果图片被锁定,请“双击”图层中“背景”解锁,如果没有图层菜单,在最上面导航栏中:窗口—图层.如下图: 点“确定”,解锁. 选用“圆角矩形工具”. ...
- Flume架构
Flume是Cloudera提供的一个高可用的,高可靠的,分布式的海量日志采集.聚合和传输的系统: Flume 介绍 Flume是由cloudera软件公司产出的高可用.高可靠.分布式的海量日志收集系 ...
- CSS笔试题
如何实现移动端9宫格 如何实现移动端下列8宫格 实现移动端图片画廊CSS样式 写一个简单的animation的css动画 美化select,radio,range样式,只需考虑谷歌浏览器,下面给出的是 ...
- Vue+webpack项目配置便于维护的目录结构
新建项目的时候创建合理的目录结构便于后期的维护是很重要 环境:vue.webpack 目录结构: 项目子目录结构 子目录结构都差不多,主要目录是在src下面操作 src目录结构 src/common ...
- H5兼容ie做法
IE浏览器支持新的标签:可以在使用新标签之前用javascript创建一个虚假的标签,这样就可以识别了.如<article>标签,在使用前,创建一个虚假的<article>标签 ...
- Python:笔记1_字符串处理【转载】
[转载自:https://www.cnblogs.com/houht/p/3308634.html] 1. 判断字符串str是否为空Approach 1:如果字符串长度为0,说明字符串为空,code如 ...
- java后台解析前端传来的json
@RequestMapping(value = {"save"}) @ResponseBody public Result save(TBaseInterventionPlan m ...
- java学习--异常
异常的概念 java异常是java提供的用于处理程序中错误的一种机制 所谓的错误是指在程序运行过程中发生的一些异常事件.如除0溢出,数组下标越界,文件不存在 设计良好的程序应该在异常发生时,提供处理这 ...
- cisco PBR
access-list 2000 permit ip 10.11.50.0 0.0.0.255 anyaccess-list 2001 permit ip 10.11.50.0 0.0.0.255 1 ...