0_Simple__cppIntegration

▶ 分离编译【留坑，在 Linux 上用命令行试一下】

▶ 源代码：

 // cppIntegration_gold.cpp

 #include <vector_types.h>

 extern "C" void computeGold(char *reference, char *idata, const unsigned int len);

 extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);

 // CPU reference for kernel(): subtracts 10 from each of the first `len`
 // bytes of `idata` and stores the results in `reference`.
 //
 //   reference  output buffer, at least `len` bytes
 //   idata      input buffer, at least `len` bytes
 //   len        number of bytes to process (0 is a no-op)
 void computeGold(char *reference, char *idata, const unsigned int len)
 {
     for (unsigned int i = 0; i < len; ++i)
         reference[i] = idata[i] - 10;
 }

 // CPU reference for kernel2(): for each of the first `len` elements,
 // writes x - y into reference.x and copies y unchanged into reference.y.
 //
 //   reference  output array, at least `len` elements
 //   idata      input array, at least `len` elements
 //   len        number of int2 elements to process (0 is a no-op)
 void computeGold2(int2 *reference, int2 *idata, const unsigned int len)
 {
     for (unsigned int i = 0; i < len; ++i)
     {
         reference[i].x = idata[i].x - idata[i].y;
         reference[i].y = idata[i].y;
     }
 }

 // cppIntegration.cu

 #include <stdlib.h>

 #include <stdio.h>

 #include <string.h>

 #include <math.h>

 #include <assert.h>

 #include <cuda_runtime.h>

 #include <helper_cuda.h>

 #include <helper_functions.h>

 // Larger of two values. Arguments and the comparison are fully
 // parenthesized so that operands containing lower-precedence operators
 // (e.g. MAX(a & b, c)) are evaluated as written. As with any function-like
 // macro, the selected argument is evaluated twice.
 #ifndef MAX
 #define MAX(a,b) (((a) > (b)) ? (a) : (b))
 #endif

 extern "C" void computeGold(char *reference, char *idata, const unsigned int len);

 extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);

 // GPU上的运算

 // GPU computation: each thread loads one 32-bit word (four packed chars),
 // subtracts 10 from every byte and writes the reassembled word back.
 //
 // Launch layout: indexing uses threadIdx.x only and there is no bounds
 // check, so the kernel expects a single 1-D block whose thread count equals
 // the number of ints in g_data (runTest launches it with len / 4 threads).
 //
 // NOTE(review): the per-byte values are sign-extended and not masked before
 // being OR-ed together, so a byte whose result would be negative presumably
 // disturbs the higher bytes -- confirm callers only pass bytes >= 10.
 __global__ void kernel(int *g_data)
 {
     const unsigned int tid = threadIdx.x;
     int data = g_data[tid];

     // For each byte: shift it into the top 8 bits, shift right by 24 to
     // isolate it, subtract 10, then shift it back to its original position.
     g_data[tid] = ((((data <<  0) >> 24) - 10) << 24)
                 | ((((data <<  8) >> 24) - 10) << 16)
                 | ((((data << 16) >> 24) - 10) <<  8)
                 | ((((data << 24) >> 24) - 10) <<  0);
 }

 // Variant of the computation that takes its input as int2 elements: each
 // thread replaces the x component of its element with x - y (y holds the
 // offset to remove) and leaves y untouched.
 //
 // Indexed by threadIdx.x only, without a bounds check: expects a single 1-D
 // block with one thread per element of g_data.
 __global__ void kernel2(int2 *g_data)
 {
     const unsigned int idx = threadIdx.x;

     // Read both components of this thread's element once.
     const int2 elem = g_data[idx];
     const int shifted = elem.x - elem.y;

     g_data[idx].x = shifted;
 }

 // 测试不同的核函数处理的结果。输入两种格式的待处理数据，及其长度

 // Runs both kernels on the same input supplied in two formats, prints the
 // CPU reference and the GPU results, and reports whether they agree.
 //
 //   data       `len` chars; overwritten with the GPU result of kernel()
 //   data_int2  `len` int2 elements; overwritten with the GPU result of kernel2()
 //   len        element count; must be a multiple of 4 because kernel()
 //              processes four chars per thread
 //
 // Returns true when every GPU result matches its CPU reference (trivially
 // true for len == 0), false on mismatch or host allocation failure.
 // CUDA API failures are reported through checkCudaErrors / getLastCudaError.
 //
 // Both kernels are launched as a single block (len / 4 and len threads
 // respectively), so len is limited by the device's maximum block size; an
 // oversized launch is reported by getLastCudaError.
 extern "C" bool runTest(char *data, int2 *data_int2, unsigned int len)
 {
     assert((len % 4) == 0);                     // length must be a multiple of 4

     // Nothing to compute; a launch with zero threads would be rejected.
     if (len == 0)
         return true;

     const unsigned int num_threads   = len / 4;
     const unsigned int mem_size      = sizeof(char) * len;
     const unsigned int mem_size_int2 = sizeof(int2) * len;

     // Host buffers for the CPU reference results.
     char *reference  = (char *)malloc(mem_size);
     int2 *reference2 = (int2 *)malloc(mem_size_int2);
     if (reference == NULL || reference2 == NULL)
     {
         fprintf(stderr, "runTest: host allocation failed\n");
         free(reference);
         free(reference2);
         return false;
     }

     // Device copies of both input formats.
     char *d_data = NULL;
     checkCudaErrors(cudaMalloc((void **)&d_data, mem_size));
     checkCudaErrors(cudaMemcpy(d_data, data, mem_size, cudaMemcpyHostToDevice));

     int2 *d_data_int2 = NULL;
     checkCudaErrors(cudaMalloc((void **)&d_data_int2, mem_size_int2));
     checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2, cudaMemcpyHostToDevice));

     // One block each: kernel() handles four chars per thread, kernel2() one int2 per thread.
     kernel<<<dim3(1, 1, 1), dim3(num_threads, 1, 1)>>>((int *)d_data);
     kernel2<<<dim3(1, 1, 1), dim3(len, 1, 1)>>>(d_data_int2);
     getLastCudaError("Kernel execution failed");    // report launch errors
     checkCudaErrors(cudaDeviceSynchronize());       // report errors raised while the kernels ran

     // CPU reference, computed while data / data_int2 still hold the input.
     computeGold(reference, data, len);
     // Precision-bounded %s: the buffer is not guaranteed to contain a terminating NUL.
     printf("ref char*:%.*s\n", (int)len, reference);

     computeGold2(reference2, data_int2, len);
     printf("ref int2 :");
     for (unsigned int i = 0; i < len; i++)
         printf("%c", reference2[i].x);
     printf("\n");

     // Fetch the GPU results, overwriting the caller's input buffers.
     checkCudaErrors(cudaMemcpy(data, d_data, mem_size, cudaMemcpyDeviceToHost));
     checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2, cudaMemcpyDeviceToHost));

     printf("gpu char*:%.*s\n", (int)len, data);
     printf("gpu int2 :");
     for (unsigned int i = 0; i < len; i++)
         printf("%c", data_int2[i].x);
     printf("\n");

     // Compare GPU output with the CPU reference.
     bool success = true;
     for (unsigned int i = 0; i < len; i++)
     {
         if (reference[i] != data[i] ||
             reference2[i].x != data_int2[i].x ||
             reference2[i].y != data_int2[i].y)
         {
             success = false;
             break;
         }
     }

     checkCudaErrors(cudaFree(d_data));
     checkCudaErrors(cudaFree(d_data_int2));
     free(reference);
     free(reference2);

     return success;
 }

 // main.cpp

 #include <iostream>

 #include <cstdlib>

 #include <cuda_runtime.h>

 #include <vector_types.h>

 #include <helper_cuda.h>

 extern "C" bool runTest(char *data, int2 *data_int2, unsigned int len);

 // Host-side driver: builds the encoded test string in both input formats
 // and hands them to runTest(), which prints the decoded results.
 int main()
 {
     const int len = 16;

     int2 i2[len];                 // CUDA built-in int2 type, usable from plain C++

     // "Hello World." with 10 added to every character, padded with four
     // bytes of value 10 (which decode to NUL) so that len is a multiple of 4.
     char str[len] = { 82, 111, 118, 118, 121, 42, 97, 121,
                       124, 118, 110, 56, 10, 10, 10, 10 };

     // Same data as (value, offset) pairs for the int2 kernel.
     for (int i = 0; i < len; i++)
     {
         i2[i].x = str[i];
         i2[i].y = 10;
     }

     runTest(str, i2, len);

     getchar();                    // keep the console window open until a key is pressed
     return 0;
 }

● 输出结果：

ref char*:Hello World.

ref int2 :Hello World.

gpu char*:Hello World.

gpu int2 :Hello World.

▶ 涨姿势：

● cuda 内置的 int2 类型，整数有序对。涉及的定义如下：

 #define __cuda_builtin_vector_align8(tag, members)  \

     struct __device_builtin__ __align__() tag      \

     {                                               \

         members                                     \

     }

 __cuda_builtin_vector_align8(int2, int x; int y;);

 typedef __device_builtin__ struct int2 int2;

● 断言宏 assert 和错误检查宏 getLastCudaError

 // assert as defined in the MSVC <assert.h>: when `expression` is false,
 // _wassert is called with the expression text, file name and line number.
 // The trailing ", 0" gives the right-hand side of || a value.
 #define assert(expression) (void)                                                                   \
 (                                                                                                   \
 (!!(expression)) || (_wassert(_CRT_WIDE(#expression), _CRT_WIDE(__FILE__), (unsigned)(__LINE__)), 0)\
 )

 #define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)

 inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)

 {

     cudaError_t err = cudaGetLastError();

     if (cudaSuccess != err)

     {

         fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",

             file, line, errorMessage, (int)err, cudaGetErrorString(err));

         DEVICE_RESET

         exit(EXIT_FAILURE);

     }

 }

0_Simple__cppIntegration的更多相关文章

随机推荐

webpack2系列step1
第一篇:HTML 本文将一步一步的介绍webpack2的配置,从最基础的一直到与node结合. 操作都一样: midir step1 && cd step1 npm init -y n ...
数据库服务器构建和部署列表(For SQL Server 2012)
前言我们可能经常安装和部署数据库服务器,但是可能突然忘记了某个设置,为后来的运维造成隐患.下面是国外大牛整理的的检查列表. 其实也包含了很多我们平时数据库配置的最佳实践.比如TEMPDB 文件的个数 ...
计算机基础--Java中int char byte的关系
计算机基础--Java中int char byte的关系重要:一个汉字占用2byte,Java中用char(0-65535 Unicode16)型字符来存字(直接打印输出的话是字而非数字),当然要用 ...
Dynamic Inversions II 逆序数的性质树状数组求逆序数
Dynamic Inversions II Time Limit: 6000/3000MS (Java/Others) Memory Limit: 128000/64000KB (Java/Other ...
[USACO 3.3.1]骑马修栅栏t
[USACO 3.3.1]骑马修栅栏时间限制: 1 Sec 内存限制: 64 MB提交: 39 解决: 17[提交][状态][讨论版] 题目描述农民John每年有很多栅栏要修理.他总是骑着马穿 ...
php中常用的字符串大小写转换函数实例解释
PHP字符串处理函数中,最为简单的几个函数,相关解释就不上了,直接看例子. PHP字符串处理函数中,最为简单的几个函数,相关解释就不上了,直接看例子. strtolower函数.strtoupper函 ...
Windows中的硬链接和软链接（hard link 和 Symbolic link）
先来了解一下Linux中的硬链接和软链接: Linux中的硬链接和软链接 Windows中的硬链接和软链接: 硬链接从Windows NT4开始,NTFS文件系统引入了HardLink这个概念,它让 ...
try catch finally 中包含return的几种情况，及返回结果
当当当,兴致勃勃的第二篇博客,散花~ 下面是正题(敲黑板) 第一种情况:在try和catch中有return,finally中没有return,且finally中没有对try或catch中要 retu ...
ubuntu中运行python脚本
1. 运行方式一新建test.py文件: touch test.py 然后vim test.py打开并编辑: print 'Hello World' 打开终端,输入命令: python test.p ...
MSSQL查询数据分页
这几天刚好碰到数据的分页查询,觉得不错,Mark一下,方法有两种,都是使用select top,效率如何就不在这讨论方法1:利用select top配合not in(或者not exists),查询 ...

0_Simple__cppIntegration

0_Simple__cppIntegration的更多相关文章

随机推荐

热门专题