《GPU高性能编程CUDA实战》附录二 散列表
▶ 使用CPU和GPU分别实现散列表
● CPU方法
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "D:\Code\CUDA\book\common\book.h"

// Size in bytes of the random key buffer.
#define SIZE (100*1024*1024)
// Number of unsigned int keys that fit in the buffer.
#define ELEMENTS (SIZE / sizeof(unsigned int))
// Number of buckets in the hash table.
#define HASH_ENTRIES (1024)

// One node of a bucket's singly linked chain.
struct Entry
{
    unsigned int key;
    void *value;
    Entry *next;
};

// Chained hash table whose nodes all come from one preallocated pool.
struct Table
{
    size_t count;     // number of buckets
    Entry **entries;  // bucket heads (NULL when a bucket is empty)
    Entry *pool;      // backing storage for every Entry
    Entry *firstFree; // next unused slot in pool
};

// Maps a key to a bucket index in [0, count).
size_t hash(unsigned int key, size_t count)
{
    return key % count;
}

// Allocates `entries` empty buckets and a pool with room for `elements`
// nodes. Terminates the program if either allocation fails.
void initialize_table(Table &table, int entries, int elements)
{
    table.count = entries;
    table.entries = (Entry**)calloc(entries, sizeof(Entry*));
    table.pool = (Entry*)malloc((size_t)elements * sizeof(Entry));
    if (table.entries == NULL || table.pool == NULL)
    {
        fprintf(stderr, "initialize_table: out of host memory\n");
        free(table.entries);
        free(table.pool);
        exit(EXIT_FAILURE);
    }
    table.firstFree = table.pool;
}

// Releases the bucket array and the node pool. The pointers are cleared so
// that calling this twice on the same table is harmless.
void free_table(Table &table)
{
    free(table.entries);
    free(table.pool);
    table.entries = NULL;
    table.pool = NULL;
    table.firstFree = NULL;
}

// Inserts (key, value) into the table.
// Precondition: the pool still has an unused slot; the table does not track
// its capacity, so the caller must not insert more nodes than were requested
// in initialize_table.
void add_to_table(Table &table, unsigned int key, void *value)
{
    size_t hashValue = hash(key, table.count);
    Entry *location = table.firstFree++;
    location->key = key;
    location->value = value;
    // Insert at the head of the bucket's chain rather than at its tail.
    location->next = table.entries[hashValue];
    table.entries[hashValue] = location;
}

// Walks every bucket, reports nodes stored in the wrong bucket, and checks
// that exactly ELEMENTS nodes are reachable.
void verify_table(const Table &table)
{
    size_t count = 0;
    for (size_t i = 0; i < table.count; i++)
    {
        for (const Entry *current = table.entries[i]; current != NULL; current = current->next)
        {
            ++count;
            const size_t expected = hash(current->key, table.count);
            // size_t values are widened to unsigned long long so the format
            // specifier is correct on both 32-bit and 64-bit toolchains.
            if (expected != i)
                printf("\n\t%u hashed to %llu, but was located at %llu\n",
                       current->key, (unsigned long long)expected, (unsigned long long)i);
        }
    }
    if (count != ELEMENTS)
        printf("\n\t%llu elements found in hash table. Should be %llu\n",
               (unsigned long long)count, (unsigned long long)ELEMENTS);
    else
        printf("\n\tAll %llu elements found in hash table.\n", (unsigned long long)count);
}

// Builds the table from ELEMENTS random keys on the CPU, reports the build
// time, and verifies the result.
int main(void)
{
    unsigned int *buffer = (unsigned int*)big_random_block(SIZE);
    if (buffer == NULL)
    {
        fprintf(stderr, "main: could not allocate the key buffer\n");
        return 1;
    }

    Table table;
    initialize_table(table, HASH_ENTRIES, ELEMENTS);

    clock_t start = clock();
    for (size_t i = 0; i < ELEMENTS; i++)
        add_to_table(table, buffer[i], (void*)NULL);
    clock_t stop = clock();

    printf("\n\tBuilding the table: %3.1f ms\n", (float)(stop - start) / (float)CLOCKS_PER_SEC * 1000.0f);

    verify_table(table);
    free_table(table);
    free(buffer);
    getchar(); // keep the console window open until a key is pressed
    return 0;
}
● GPU方法(用到了前面的原子锁)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "D:\Code\CUDA\book\common\book.h"

// Size in bytes of the random key buffer.
#define SIZE (100*1024*1024)
// Number of unsigned int keys that fit in the buffer.
#define ELEMENTS (SIZE / sizeof(unsigned int))
// Number of buckets in the hash table (and number of bucket locks).
#define HASH_ENTRIES (1024)
// Launch configuration; the kernel strides over the input, so any grid works.
#define ADD_BLOCKS (60)
#define ADD_THREADS (256)
// Number of lanes serialized by the kernel's inner loop.
#define LANES_PER_WARP (32)

// Aborts with file/line context when a CUDA runtime call fails, so that an
// early failure is not silently carried into later calls.
#define TABLE_CUDA_CHECK(call)                                              \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(status_));                 \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Spin lock backed by one int in device memory (0 = free, 1 = held).
// The constructor and destructor run on the host; lock()/unlock() run on the
// device. Objects are copied to the device bitwise, so the host object must
// outlive every kernel that uses the copy.
struct Lock
{
    int *mutex;
    Lock(void)
    {
        int state = 0;
        TABLE_CUDA_CHECK(cudaMalloc((void **)&mutex, sizeof(int)));
        TABLE_CUDA_CHECK(cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice));
    }
    ~Lock(void)
    {
        cudaFree(mutex);
    }
    __device__ void lock(void)
    {
        // Spin until this thread swaps the flag from 0 to 1.
        while (atomicCAS(mutex, 0, 1) != 0);
        // Order the critical section's accesses after the acquisition.
        __threadfence();
    }
    __device__ void unlock(void)
    {
        // Publish the critical section's writes before releasing the lock.
        __threadfence();
        atomicExch(mutex, 0);
    }
};

// One node of a bucket's singly linked chain.
struct Entry
{
    unsigned int key;
    void *value;
    Entry *next;
};

// Chained hash table; on the device, entries and pool are device pointers.
struct Table
{
    size_t count;     // number of buckets
    Entry **entries;  // bucket heads (NULL when a bucket is empty)
    Entry *pool;      // backing storage, one Entry per input element
    Entry *firstFree; // unused by the GPU version; kept for layout parity
};

// Maps a key to a bucket index in [0, count).
__device__ __host__ size_t hash(unsigned int key, size_t count)
{
    return key % count;
}

// Allocates `entries` zeroed bucket heads and a pool of `elements` nodes in
// device memory.
void initialize_table(Table &table, int entries, int elements)
{
    table.count = entries;
    TABLE_CUDA_CHECK(cudaMalloc((void**)&table.entries, entries * sizeof(Entry*)));
    TABLE_CUDA_CHECK(cudaMemset(table.entries, 0, entries * sizeof(Entry*)));
    TABLE_CUDA_CHECK(cudaMalloc((void**)&table.pool, (size_t)elements * sizeof(Entry)));
    table.firstFree = NULL;
}

// Releases the device allocations. The pointers are cleared so that calling
// this twice on the same table does not free the same memory twice.
void free_table(Table &table)
{
    TABLE_CUDA_CHECK(cudaFree(table.entries));
    TABLE_CUDA_CHECK(cudaFree(table.pool));
    table.entries = NULL;
    table.pool = NULL;
}

// Inserts keys[i] / values[i] for every i in [0, ELEMENTS) into `table`.
// Launch: 1-D grid of 1-D blocks of any size; each thread handles elements
// tid, tid + stride, ... No shared memory is used.
// `lock` must point to table.count Lock objects in device memory, one per
// bucket. Element i always uses pool slot i, so no allocation is contended.
__global__ void add_to_table(unsigned int *keys, void **values, Table table, Lock *lock)
{
    size_t tid = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
    const size_t stride = (size_t)blockDim.x * gridDim.x;
    // Bucket heads are shared between threads; volatile forces every access
    // inside the critical section to go to memory.
    Entry * volatile *heads = table.entries;

    while (tid < ELEMENTS)
    {
        unsigned int key = keys[tid];
        size_t hashValue = hash(key, table.count);
        // Spread the warp out: each of the 32 lanes takes the lock in a
        // different iteration, so two lanes of one warp never spin on the
        // same lock at the same time.
        for (int i = 0; i < LANES_PER_WARP; i++)
        {
            if ((int)(tid % LANES_PER_WARP) == i)
            {
                Entry *location = &(table.pool[tid]);
                location->key = key;
                location->value = values[tid];
                lock[hashValue].lock();
                // Insert at the head of the bucket's chain.
                location->next = heads[hashValue];
                heads[hashValue] = location;
                lock[hashValue].unlock();
            }
        }
        tid += stride;
    }
}

// Translates a pointer into the device pool into the matching pointer into
// the host copy of the pool.
static Entry *rebase_entry(const Entry *devEntry, const Entry *devPool, Entry *hostPool)
{
    return hostPool + (devEntry - devPool);
}

// Copies the device table into newly allocated host memory and rewrites
// every stored pointer so it refers to the host copy.
// The caller owns hostTable.entries and hostTable.pool and must free() them.
void copy_table_to_host(const Table &table, Table &hostTable)
{
    hostTable.count = table.count;
    hostTable.entries = (Entry**)calloc(table.count, sizeof(Entry*));
    hostTable.pool = (Entry*)malloc(ELEMENTS * sizeof(Entry));
    hostTable.firstFree = NULL;
    if (hostTable.entries == NULL || hostTable.pool == NULL)
    {
        fprintf(stderr, "copy_table_to_host: out of host memory\n");
        exit(EXIT_FAILURE);
    }

    TABLE_CUDA_CHECK(cudaMemcpy(hostTable.entries, table.entries, table.count * sizeof(Entry*), cudaMemcpyDeviceToHost));
    TABLE_CUDA_CHECK(cudaMemcpy(hostTable.pool, table.pool, ELEMENTS * sizeof(Entry), cudaMemcpyDeviceToHost));

    // Bucket heads: host address = device address - device pool + host pool.
    for (size_t i = 0; i < table.count; i++)
    {
        if (hostTable.entries[i] != NULL)
            hostTable.entries[i] = rebase_entry(hostTable.entries[i], table.pool, hostTable.pool);
    }
    // Same offset for every node's link to the next node in its chain.
    for (size_t i = 0; i < ELEMENTS; i++)
    {
        if (hostTable.pool[i].next != NULL)
            hostTable.pool[i].next = rebase_entry(hostTable.pool[i].next, table.pool, hostTable.pool);
    }
}

// Copies the device table to the host, reports nodes stored in the wrong
// bucket, and checks that exactly ELEMENTS nodes are reachable.
void verify_table(const Table &dev_table)
{
    Table table;
    copy_table_to_host(dev_table, table);

    size_t count = 0;
    for (size_t i = 0; i < table.count; i++)
    {
        for (const Entry *current = table.entries[i]; current != NULL; current = current->next)
        {
            ++count;
            const size_t expected = hash(current->key, table.count);
            // size_t values are widened to unsigned long long so the format
            // specifier is correct on both 32-bit and 64-bit toolchains.
            if (expected != i)
                printf("%u hashed to %llu, but was located at %llu\n",
                       current->key, (unsigned long long)expected, (unsigned long long)i);
        }
    }
    if (count != ELEMENTS)
        printf("%llu elements found in hash table. Should be %llu\n",
               (unsigned long long)count, (unsigned long long)ELEMENTS);
    else
        printf("All %llu elements found in hash table.\n", (unsigned long long)count);

    // Release the host copy made by copy_table_to_host.
    free(table.entries);
    free(table.pool);
}

// Builds the table from ELEMENTS random keys on the GPU, reports the kernel
// time, and verifies the result on the host.
int main(void)
{
    unsigned int *buffer = (unsigned int*)big_random_block(SIZE);
    if (buffer == NULL)
    {
        fprintf(stderr, "main: could not allocate the key buffer\n");
        return 1;
    }

    unsigned int *dev_keys;
    void **dev_values;
    TABLE_CUDA_CHECK(cudaMalloc((void**)&dev_keys, SIZE));
    // One pointer per element: sizeof(void*) may be larger than
    // sizeof(unsigned int), so SIZE bytes would be too small here.
    TABLE_CUDA_CHECK(cudaMalloc((void**)&dev_values, ELEMENTS * sizeof(void*)));
    TABLE_CUDA_CHECK(cudaMemset(dev_values, 0, ELEMENTS * sizeof(void*)));
    TABLE_CUDA_CHECK(cudaMemcpy(dev_keys, buffer, SIZE, cudaMemcpyHostToDevice));

    Table table;
    initialize_table(table, HASH_ENTRIES, ELEMENTS);

    // One lock per bucket; the host objects own the device-side flags.
    Lock lock[HASH_ENTRIES];
    Lock *dev_lock;
    TABLE_CUDA_CHECK(cudaMalloc((void**)&dev_lock, HASH_ENTRIES * sizeof(Lock)));
    TABLE_CUDA_CHECK(cudaMemcpy(dev_lock, lock, HASH_ENTRIES * sizeof(Lock), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    TABLE_CUDA_CHECK(cudaEventCreate(&start));
    TABLE_CUDA_CHECK(cudaEventCreate(&stop));
    TABLE_CUDA_CHECK(cudaEventRecord(start, 0));

    add_to_table<<<ADD_BLOCKS, ADD_THREADS>>>(dev_keys, dev_values, table, dev_lock);
    TABLE_CUDA_CHECK(cudaGetLastError()); // surfaces launch-configuration errors

    TABLE_CUDA_CHECK(cudaEventRecord(stop, 0));
    TABLE_CUDA_CHECK(cudaEventSynchronize(stop)); // surfaces in-kernel faults
    float elapsedTime;
    TABLE_CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to hash: %3.1f ms\n", elapsedTime);

    verify_table(table);

    TABLE_CUDA_CHECK(cudaEventDestroy(start));
    TABLE_CUDA_CHECK(cudaEventDestroy(stop));
    free_table(table);
    TABLE_CUDA_CHECK(cudaFree(dev_lock));
    TABLE_CUDA_CHECK(cudaFree(dev_keys));
    TABLE_CUDA_CHECK(cudaFree(dev_values));
    free(buffer);
    getchar(); // keep the console window open until a key is pressed
    return 0;
}
《GPU高性能编程CUDA实战》附录二 散列表的更多相关文章
- [问题解决]《GPU高性能编程CUDA实战》中第4章Julia实例“显示器驱动已停止响应,并且已恢复”问题的解决方法
以下问题的出现及解决都基于"WIN7+CUDA7.5". 问题描述:当我编译运行<GPU高性能编程CUDA实战>中第4章所给Julia实例代码时,出现了显示器闪动的现象 ...
- 《GPU高性能编程CUDA实战》附录四 其他头文件
▶ cpu_bitmap.h #ifndef __CPU_BITMAP_H__ #define __CPU_BITMAP_H__ #include "gl_helper.h" st ...
- 《GPU高性能编程CUDA实战》附录一 高级原子操作
▶ 本章介绍了手动实现原子操作.重构了第五章向量点积的过程.核心是通过定义结构Lock及其运算,实现锁定,读写,解锁的过程. ● 章节代码 #include <stdio.h> #incl ...
- 《GPU高性能编程CUDA实战》附录三 关于book.h
▶ 本书中用到的公用函数放到了头文件book.h中 #ifndef __BOOK_H__ #define __BOOK_H__ #include <stdio.h> #include &l ...
- 《GPU高性能编程CUDA实战》第五章 线程并行
▶ 本章介绍了线程并行,并给出四个例子.长向量加法.波纹效果.点积和显示位图. ● 长向量加法(线程块并行 + 线程并行) #include <stdio.h> #include &quo ...
- 《GPU高性能编程CUDA实战》第十一章 多GPU系统的CUDA C
▶ 本章介绍了多设备情况下的 CUDA 编程,以及一些特殊存储类型对计算速度的影响 ● 显存和零拷贝内存的拷贝与计算对比 #include <stdio.h> #include " ...
- 《GPU高性能编程CUDA实战》第七章 纹理内存
▶ 本章介绍了纹理内存的使用,并给出了热传导的两个例子.分别使用了一维和二维纹理单元. ● 热传导(使用一维纹理) #include <stdio.h> #include "c ...
- 《GPU高性能编程CUDA实战》第四章 简单的线程块并行
▶ 本章介绍了线程块并行,并给出两个例子:长向量加法和绘制julia集. ● 长向量加法,中规中矩的GPU加法,包含申请内存和显存,赋值,显存传入,计算,显存传出,处理结果,清理内存和显存.用到了 t ...
- 《GPU高性能编程CUDA实战》第八章 图形互操作性
▶ OpenGL与DirectX,等待填坑. ● basic_interop #include <stdio.h> #include "cuda_runtime.h" ...
随机推荐
- NASSA’s Robot
NASSA的机器人降落到了火星,降落的地方可以用X-Y坐标表示.机器人最开始在(0, 0).由于传输问题,部分指令可能会混淆,现在给出确定的命令与未知命令,请帮忙确认机器人的X.Y坐标最小最大值分别是 ...
- CountDownLatch的简单理解
CountDownLatch的概念 CountDownLatch是一个同步工具类,用来协调多个线程之间的同步,或者说起到线程之间的通信(而不是用作互斥的作用). CountDownLatch能够使一个 ...
- 【多线程学习笔记整理】002_线程的停止、暂停、与yield
一.停止线程的三种方式 停止线程是多线程中的一个很重要的点,停止线程意味着在线程处理完当前任务之前终止正在做的操作,但是如果不能正确的操作,可能会发生不可预期的结果. 1)使用退出标志,使线程正常退出 ...
- Mysql ON子句和USING子句
Mysql ON子句和USING子句 Mysql 中联接SQL语句中,ON子句的语法格式为:table1.column_name = table2.column_name. 当模式设计对联接表的列 ...
- socket、WebSocket
WebSocket 协议本质上是一个基于TCP的协议,它由通信协议和编程API组成,WebSocket能够在浏览器和服务器之间建立双向连接,以基于事件的方式,赋予浏览器实时通信能力. socket本质 ...
- mysql复制表结构create table as和like的区别
对于MySQL的复制相同表结构方法,有create table as 和create table like 两种,区别是什么呢? create table t2 as select * from t1 ...
- JVM内存模型(一)
主要澄清之前对JVM内存模型的一些误区: JMV内存主要分为5块:方法区(Method Area),堆区(Heap),虚拟机栈(VM stack),本地方法栈(Native Method stack) ...
- hadoop框架结构介绍
近年,随着互联网的发展特别是移动互联网的发展,数据的增长呈现出一种爆炸式的成长势头.单是谷歌的爬虫程序每天下载的网页超过1亿个(2000年数据,)数据的爆炸式增长直接推动了海量数据处理技术的发展.谷歌 ...
- PHP 小技巧之__callStatic魔术方法使用
使用 PHP 框架时,经常会用到 ORM 模型查询数据库,有没有疑问:为啥有些 ORM 中的静态查询方法,不能通过函数追踪下去呢,很有可能就是使用了 __callStatic 魔术方法的小技巧 这里贴 ...
- 【python】常用的一些内置函数
1.cmp cmp(A,B)函数,比较A,B的大小,如果A大于B,返回1,A小于B返回-1,A等于B返回0 print cmp(12,33) >>>-1 print cmp(&quo ...