Use Compressed Sparse Row Format (CSR) to represent matrix

 #include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "gputimer.h"
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define WARP_SIZE 32 __global__ void
spmv_csr_vector_kernel ( const int num_rows ,
const int * ptr ,
const int * indices ,
const double * data ,
const double * x ,
double * y)
{
__shared__ double vals [WARP_SIZE];
int thread_id = blockDim.x * blockIdx.x + threadIdx.x ; // global thread index
int warp_id = thread_id / WARP_SIZE; // global warp index
int lane = thread_id & (WARP_SIZE - ); // thread index within the warp
// one warp per row
int row = warp_id ;
if ( row < num_rows )
{
int row_start = ptr [ row ];
int row_end = ptr [ row +];
// compute running sum per thread
vals [ threadIdx.x ] = ;
for ( int jj = row_start + lane ; jj < row_end ; jj += WARP_SIZE)
vals [ threadIdx.x ] += data [ jj ] * x [ indices [ jj ]];
// parallel reduction in shared memory
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
if ( lane < ) vals [ threadIdx.x ] += vals [ threadIdx.x + ];
// first thread writes the result
if ( lane == )
y[ row ] += vals [ threadIdx.x ];
}
} __global__ void
spmv_csr_scalar_kernel ( const int num_rows ,
const int * ptr ,
const int * indices ,
const double * data ,
const double * x ,
double * y)
{
int row = blockDim.x * blockIdx.x + threadIdx.x ;
if( row < num_rows )
{
double dot = ;
int row_start = ptr [ row ];
int row_end = ptr [ row +];
for (int jj = row_start ; jj < row_end ; jj ++)
dot += data [ jj ] * x[ indices [ jj ]];
y[ row ] += dot ;
}
} int main(int argc,char **argv)
{
double h_data[]={,,,,,,,,};
int h_col[]={,,,,,,,,};
int h_ptr[]={,,,,};
double h_x[]={,,,,};
double h_y[]={,,,};
int num_rows=; double *d_data;
int *d_col;
int *d_ptr;
double *d_x;
double *d_y; cudaMalloc((void**) &d_data,sizeof(double)*);
cudaMalloc((void**) &d_col,sizeof(int)*);
cudaMalloc((void**) &d_ptr,sizeof(int)*);
cudaMalloc((void**) &d_x,sizeof(double)*);
cudaMalloc((void**) &d_y,sizeof(double)*);
cudaMemcpy((void*)d_data, (void*)h_data, sizeof(double)*, cudaMemcpyHostToDevice);
cudaMemcpy((void*)d_col, (void*)h_col, sizeof(int)*, cudaMemcpyHostToDevice);
cudaMemcpy((void*)d_ptr, (void*)h_ptr, sizeof(int)*, cudaMemcpyHostToDevice);
cudaMemcpy((void*)d_x, (void*)h_x, sizeof(double)*, cudaMemcpyHostToDevice);
cudaMemcpy((void*)d_y, (void*)h_y, sizeof(double)*, cudaMemcpyHostToDevice); GpuTimer timer;
timer.Start();
spmv_csr_vector_kernel<<<num_rows,>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
//spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
timer.Stop();
printf("Duration: %g ms\n",timer.Elapsed()); cudaMemcpy((void*)h_y, (void*)d_y, sizeof(double)*, cudaMemcpyDeviceToHost); for(int i=;i<num_rows;i++)
printf("%.5f ",h_y[i]);
printf("\n"); return ;
}

ref:

http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf  

ch4.3

CUDA[4] sample program: matrix-vector multiplication的更多相关文章

  1. ACM学习历程——UVA442 Matrix Chain Multiplication(栈)

    Description   Matrix Chain Multiplication  Matrix Chain Multiplication  Suppose you have to evaluate ...

  2. Matrix Chain Multiplication[HDU1082]

    Matrix Chain Multiplication Time Limit: 2000/1000 MS (Java/Others)    Memory Limit: 65536/32768 K (J ...

  3. UVA 442 二十 Matrix Chain Multiplication

    Matrix Chain Multiplication Time Limit:3000MS     Memory Limit:0KB     64bit IO Format:%lld & %l ...

  4. UVa 442 Matrix Chain Multiplication(矩阵链,模拟栈)

    意甲冠军  由于矩阵乘法计算链表达的数量,需要的计算  后的电流等于行的矩阵的矩阵的列数  他们乘足够的人才  非法输出error 输入是严格合法的  即使仅仅有两个相乘也会用括号括起来  并且括号中 ...

  5. Matrix Chain Multiplication(表达式求值用栈操作)

    题目链接:http://acm.hdu.edu.cn/showproblem.php?pid=1082 Matrix Chain Multiplication Time Limit: 2000/100 ...

  6. UVA——442 Matrix Chain Multiplication

    442 Matrix Chain MultiplicationSuppose you have to evaluate an expression like A*B*C*D*E where A,B,C ...

  7. 例题6-3 Matrix Chain Multiplication ,Uva 442

    这个题思路没有任何问题,但还是做了近三个小时,其中2个多小时调试 得到的经验有以下几点: 一定学会调试,掌握输出中间量的技巧,加强gdb调试的学习 有时候代码不对,得到的结果却是对的(之后总结以下常见 ...

  8. UVa442 Matrix Chain Multiplication

    // UVa442 Matrix Chain Multiplication // 题意:输入n个矩阵的维度和一些矩阵链乘表达式,输出乘法的次数.假定A和m*n的,B是n*p的,那么AB是m*p的,乘法 ...

  9. uva-442 Matrix Chain Multiplication

    Suppose you have to evaluate an expression like A*B*C*D*E where A,B,C,D and E are matrices. Since ma ...

随机推荐

  1. 清除eclipse,STS workspace历史记录

    记一下 打开eclipse下的/configuration/.settings目录 修改文件org.eclipse.ui.ide.prefs文件 把RECENT_WORKSPACES这项修改为你需要的 ...

  2. python tp_ready函数分析

    int PyType_Ready(PyTypeObject *type) { PyObject *dict, *bases; PyTypeObject *base; Py_ssize_t i, n; ...

  3. JavaScript学习-3——数组、函数、递归

    本章目录 -----------①数组 -----------②函数 -----------③递归 一.数组 弱类型:任何类型数据,且没有强度限制: 强类型:同一类型的数据存储的集合(内存中连续存储) ...

  4. Object.defineProperty(o,p,descriptor ) 理解应用

    1. Object.defineProperty  在一个对象上定义一个新属性,或修改一个已经存在的属性, 最终返回这个对象. var __define = this.__define || func ...

  5. java.util.HashSet, java.util.LinkedHashMap, java.util.IdentityHashMap 源码阅读 (JDK 1.8.0_111)

    一.java.util.HashSet 1.1 HashSet集成结构 1.2 java.util.HashSet属性 private transient HashMap<E,Object> ...

  6. 简单定时器的Java实现

    这两个类使用起来非常方便,可以完成我们对定时器的绝大多数需求 Timer类是用来执行任务的类,它接受一个TimerTask做参数 Timer有两种执行任务的模式,最常用的是schedule,它可以以两 ...

  7. AD+DMA+USART实验中的收获和总结

    由于实验室用的是USART3接口,但是在基地实验时,由于没有RS232,只能换到USART1,进行实验.(在交作业的时候,记得要再换回去) 在这个过程中,遇到困难,用串口软件发送数据时无响应,应该意味 ...

  8. java实现两个不同list对象合并后并排序

    工作上遇到一个要求两个不同list对象合并后并排序1.问题描述从数据库中查询两张表的当天数据,并对这两张表的数据,进行合并,然后根据时间排序.2.思路从数据库中查询到的数据放到各自list中,先遍历两 ...

  9. spring boot 整合 elasticsearch 5.x

    spring boot与elasticsearch集成有两种方式.一种是直接使用elasticsearch.一种是使用data中间件. 本文只指针使用maven集成elasticsearch 5.x, ...

  10. python list初始化技巧

    一维列表 # 初始化递增的list,与L = [i for i in range(10)] 效果相同 L = range(10) # print(L) # [0,1,2,3,4,5,6,7,8,9] ...