Reference:

https://www.cnblogs.com/devilmaycry812839668/p/15348610.html

I have recently been reading the WarpDrive code. The parts that run on the GPU are hooked up through the pycuda library, which makes it convenient to call CUDA code from a Python environment. While using it, though, I ran into one issue: the CUDA functions must be initialized after the CUDA memory has been initialized, otherwise an error is raised.

Code (this version runs correctly):

import numpy as np

from warp_drive.managers.data_manager import CUDADataManager
from warp_drive.managers.function_manager import (
    CUDAFunctionManager, CUDALogController, CUDASampler, CUDAEnvironmentReset
)
from warp_drive.utils.data_feed import DataFeed

source_code = """
// A function to demonstrate how to manipulate data on the GPU.
// This function increments each entry of the random data array we pushed to the GPU before.
// Each index corresponding to (env_id, agent_id) in the array is incremented by "agent_id + env_id".
// Everything inside the if() block runs in parallel for each agent and environment.
//
extern "C"{
    __global__ void cuda_increment(
        float* data,
        int num_agents
    )
    {
        int env_id = blockIdx.x;
        int agent_id = threadIdx.x;
        if (agent_id < num_agents){
            int array_index = env_id * num_agents + agent_id;
            int increment = env_id + agent_id;
            data[array_index] += increment;
        }
    }
}
"""

from timeit import Timer


def push_random_data_and_increment_timer(
    num_runs=1,
    num_envs=2,
    num_agents=3,
    source_code=None
):
    assert source_code is not None

    def push_random_data(num_agents, num_envs):
        # Initialize the CUDA data manager
        cuda_data_manager = CUDADataManager(
            num_agents=num_agents,
            num_envs=num_envs,
            episode_length=100
        )

        # Create random data
        random_data = np.random.rand(num_envs, num_agents)

        # Push data from host to device
        data_feed = DataFeed()
        data_feed.add_data(
            name="random_data",
            data=random_data,
        )
        data_feed.add_data(
            name="num_agents",
            data=num_agents
        )
        cuda_data_manager.push_data_to_device(data_feed)

        return cuda_data_manager

    # Initialize the CUDA function manager
    def cuda_func_init():
        cuda_function_manager = CUDAFunctionManager(
            num_agents=num_agents,  # cuda_data_manager.meta_info("n_agents"),
            num_envs=num_envs       # cuda_data_manager.meta_info("n_envs")
        )

        # Load source code and initialize function
        cuda_function_manager.load_cuda_from_source_code(
            source_code,
            default_functions_included=False
        )
        cuda_function_manager.initialize_functions(["cuda_increment"])
        increment_function = cuda_function_manager._get_function("cuda_increment")

        return cuda_function_manager, increment_function

    def increment_data(cuda_data_manager, cuda_function_manager, increment_function):
        increment_function(
            cuda_data_manager.device_data("random_data"),
            cuda_data_manager.device_data("num_agents"),
            block=cuda_function_manager.block,
            grid=cuda_function_manager.grid
        )

    # CUDA memory (data manager) initialization comes first ...
    data_push_time = Timer(lambda: push_random_data(num_agents, num_envs)).timeit(number=num_runs)
    cuda_data_manager = push_random_data(num_agents, num_envs)

    # ... then the CUDA function initialization ...
    cuda_function_manager, increment_function = cuda_func_init()

    # ... then the CUDA function is run
    program_run_time = Timer(lambda: increment_data(cuda_data_manager, cuda_function_manager, increment_function)).timeit(number=num_runs)

    print(cuda_data_manager.pull_data_from_device('random_data'))

    return {
        "data push times": data_push_time,
        "code run time": program_run_time
    }


num_runs = 1000
times = {}

for scenario in [
    (1, 1),
    (1, 100),
    (1, 1000),
    (100, 1000),
    (1000, 1000)
]:
    num_envs, num_agents = scenario
    times.update(
        {
            f"envs={num_envs}, agents={num_agents}":
                push_random_data_and_increment_timer(
                    num_runs,
                    num_envs,
                    num_agents,
                    source_code
                )
        }
    )

print(f"Times for {num_runs} function calls")
print("*" * 40)

for key, value in times.items():
    print(f"{key:30}: mean data push times: {value['data push times']:10.5}s,\t mean increment times: {value['code run time']:10.5}s")

'''
print(cuda_data_manager._meta_info)
print(cuda_data_manager._host_data)
print(cuda_data_manager._device_data_pointer)
print(cuda_data_manager._scalar_data_list)
print(cuda_data_manager._reset_data_list)
print(cuda_data_manager._log_data_list)
print(cuda_data_manager._device_data_via_torch)
print(cuda_data_manager._shared_constants)
print(cuda_data_manager._shape)
print(cuda_data_manager._dtype)
print(tensor_on_device)
time.sleep(300)
'''

Code that raises an error:

import numpy as np

from warp_drive.managers.data_manager import CUDADataManager
from warp_drive.managers.function_manager import (
    CUDAFunctionManager, CUDALogController, CUDASampler, CUDAEnvironmentReset
)
from warp_drive.utils.data_feed import DataFeed

source_code = """
// A function to demonstrate how to manipulate data on the GPU.
// This function increments each entry of the random data array we pushed to the GPU before.
// Each index corresponding to (env_id, agent_id) in the array is incremented by "agent_id + env_id".
// Everything inside the if() block runs in parallel for each agent and environment.
//
extern "C"{
    __global__ void cuda_increment(
        float* data,
        int num_agents
    )
    {
        int env_id = blockIdx.x;
        int agent_id = threadIdx.x;
        if (agent_id < num_agents){
            int array_index = env_id * num_agents + agent_id;
            int increment = env_id + agent_id;
            data[array_index] += increment;
        }
    }
}
"""

from timeit import Timer


def push_random_data_and_increment_timer(
    num_runs=1,
    num_envs=2,
    num_agents=3,
    source_code=None
):
    assert source_code is not None

    def push_random_data(num_agents, num_envs):
        # Initialize the CUDA data manager
        cuda_data_manager = CUDADataManager(
            num_agents=num_agents,
            num_envs=num_envs,
            episode_length=100
        )

        # Create random data
        random_data = np.random.rand(num_envs, num_agents)

        # Push data from host to device
        data_feed = DataFeed()
        data_feed.add_data(
            name="random_data",
            data=random_data,
        )
        data_feed.add_data(
            name="num_agents",
            data=num_agents
        )
        cuda_data_manager.push_data_to_device(data_feed)

        return cuda_data_manager

    # Initialize the CUDA function manager
    def cuda_func_init():
        cuda_function_manager = CUDAFunctionManager(
            num_agents=num_agents,  # cuda_data_manager.meta_info("n_agents"),
            num_envs=num_envs       # cuda_data_manager.meta_info("n_envs")
        )

        # Load source code and initialize function
        cuda_function_manager.load_cuda_from_source_code(
            source_code,
            default_functions_included=False
        )
        cuda_function_manager.initialize_functions(["cuda_increment"])
        increment_function = cuda_function_manager._get_function("cuda_increment")

        return cuda_function_manager, increment_function

    def increment_data(cuda_data_manager, cuda_function_manager, increment_function):
        increment_function(
            cuda_data_manager.device_data("random_data"),
            cuda_data_manager.device_data("num_agents"),
            block=cuda_function_manager.block,
            grid=cuda_function_manager.grid
        )

    # CUDA function initialization comes FIRST here ...
    cuda_function_manager, increment_function = cuda_func_init()    ###

    # ... and the CUDA memory (data manager) initialization only AFTER it
    cuda_data_manager = push_random_data(num_agents, num_envs)      ###

    # data push timing is skipped in this variant
    program_run_time = Timer(lambda: increment_data(cuda_data_manager, cuda_function_manager, increment_function)).timeit(number=num_runs)

    print(cuda_data_manager.pull_data_from_device('random_data'))

    return {
        "data push times": 0,  # data_push_time
        "code run time": program_run_time
    }


num_runs = 1000
times = {}

for scenario in [
    (1, 1),
    (1, 100),
    (1, 1000),
    (100, 1000),
    (1000, 1000)
]:
    num_envs, num_agents = scenario
    times.update(
        {
            f"envs={num_envs}, agents={num_agents}":
                push_random_data_and_increment_timer(
                    num_runs,
                    num_envs,
                    num_agents,
                    source_code
                )
        }
    )

print(f"Times for {num_runs} function calls")
print("*" * 40)

for key, value in times.items():
    print(f"{key:30}: mean data push times: {value['data push times']:10.5}s,\t mean increment times: {value['code run time']:10.5}s")

'''
print(cuda_data_manager._meta_info)
print(cuda_data_manager._host_data)
print(cuda_data_manager._device_data_pointer)
print(cuda_data_manager._scalar_data_list)
print(cuda_data_manager._reset_data_list)
print(cuda_data_manager._log_data_list)
print(cuda_data_manager._device_data_via_torch)
print(cuda_data_manager._shared_constants)
print(cuda_data_manager._shape)
print(cuda_data_manager._dtype)
print(tensor_on_device)
time.sleep(300)
'''

Error message:

Traceback (most recent call last):
  File "/home/xxxxxx/warp-drive/devil_make/tutorial-1-warp_drive_basics.py", line 145, in <module>
    source_code
  File "/home/xxxxxx/warp-drive/devil_make/tutorial-1-warp_drive_basics.py", line 116, in push_random_data_and_increment_timer
    program_run_time = Timer(lambda: increment_data(cuda_data_manager, cuda_function_manager, increment_function)).timeit(number=num_runs)
  File "/home/xxxxxx/anaconda3/envs/warp_drive/lib/python3.7/timeit.py", line 177, in timeit
    timing = self.inner(it, self.timer)
  File "<timeit-src>", line 6, in inner
  File "/home/xxxxxx/warp-drive/devil_make/tutorial-1-warp_drive_basics.py", line 116, in <lambda>
    program_run_time = Timer(lambda: increment_data(cuda_data_manager, cuda_function_manager, increment_function)).timeit(number=num_runs)
  File "/home/xxxxxx/warp-drive/devil_make/tutorial-1-warp_drive_basics.py", line 97, in increment_data
    grid=cuda_function_manager.grid
  File "/home/xxxxxx/anaconda3/envs/warp_drive/lib/python3.7/site-packages/pycuda/driver.py", line 480, in function_call
    func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle

From this we can conclude that, when using pycuda, initializing the CUDA functions before any CUDA memory has been initialized produces the following error:

Error message:

pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle

A plausible explanation (I have not verified it against the WarpDrive internals) is that the kernel handle pycuda returns is bound to the CUDA context that is current when the module is loaded; if that context is only established later, by the data manager's first device allocation, the handle is no longer valid by the time the kernel is launched with block and grid set.
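
When debugging this kind of "invalid resource handle" error, one thing worth checking is whether a CUDA context is actually current at the point where the functions are loaded. A minimal pycuda check (my own debugging aid, not something WarpDrive provides) might look like this:

import pycuda.driver as drv

drv.init()                              # initialize the driver API; this does not create a context
print(drv.Context.get_current())        # expected to print None if no context is current yet

If this prints None right before the function manager is created, that would be consistent with the explanation above.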

If the CUDA memory is initialized before the CUDA functions, the error does not occur:

Code:

import numpy as np

from warp_drive.managers.data_manager import CUDADataManager
from warp_drive.managers.function_manager import (
    CUDAFunctionManager, CUDALogController, CUDASampler, CUDAEnvironmentReset
)
from warp_drive.utils.data_feed import DataFeed

source_code = """
// A function to demonstrate how to manipulate data on the GPU.
// This function increments each entry of the random data array we pushed to the GPU before.
// Each index corresponding to (env_id, agent_id) in the array is incremented by "agent_id + env_id".
// Everything inside the if() block runs in parallel for each agent and environment.
//
extern "C"{
    __global__ void cuda_increment(
        float* data,
        int num_agents
    )
    {
        int env_id = blockIdx.x;
        int agent_id = threadIdx.x;
        if (agent_id < num_agents){
            int array_index = env_id * num_agents + agent_id;
            int increment = env_id + agent_id;
            data[array_index] += increment;
        }
    }
}
"""

from timeit import Timer


def push_random_data_and_increment_timer(
    num_runs=1,
    num_envs=2,
    num_agents=3,
    source_code=None
):
    assert source_code is not None

    def push_random_data(num_agents, num_envs):
        # Initialize the CUDA data manager
        cuda_data_manager = CUDADataManager(
            num_agents=num_agents,
            num_envs=num_envs,
            episode_length=100
        )

        # Create random data
        random_data = np.random.rand(num_envs, num_agents)

        # Push data from host to device
        data_feed = DataFeed()
        data_feed.add_data(
            name="random_data",
            data=random_data,
        )
        data_feed.add_data(
            name="num_agents",
            data=num_agents
        )
        cuda_data_manager.push_data_to_device(data_feed)

        return cuda_data_manager

    # Initialize the CUDA function manager
    def cuda_func_init():
        cuda_function_manager = CUDAFunctionManager(
            num_agents=num_agents,  # cuda_data_manager.meta_info("n_agents"),
            num_envs=num_envs       # cuda_data_manager.meta_info("n_envs")
        )

        # Load source code and initialize function
        cuda_function_manager.load_cuda_from_source_code(
            source_code,
            default_functions_included=False
        )
        cuda_function_manager.initialize_functions(["cuda_increment"])
        increment_function = cuda_function_manager._get_function("cuda_increment")

        return cuda_function_manager, increment_function

    def increment_data(cuda_data_manager, cuda_function_manager, increment_function):
        increment_function(
            cuda_data_manager.device_data("random_data"),
            cuda_data_manager.device_data("num_agents"),
            block=cuda_function_manager.block,
            grid=cuda_function_manager.grid
        )

    # CUDA memory (data manager) initialization comes first ...
    cuda_data_manager = push_random_data(num_agents, num_envs)      ###

    # ... then the CUDA function initialization
    cuda_function_manager, increment_function = cuda_func_init()    ###

    # data push timing is skipped in this variant
    program_run_time = Timer(lambda: increment_data(cuda_data_manager, cuda_function_manager, increment_function)).timeit(number=num_runs)

    print(cuda_data_manager.pull_data_from_device('random_data'))

    return {
        "data push times": '0',  # data_push_time
        "code run time": program_run_time
    }


num_runs = 1000
times = {}

for scenario in [
    (1, 1),
    (1, 100),
    (1, 1000),
    (100, 1000),
    (1000, 1000)
]:
    num_envs, num_agents = scenario
    times.update(
        {
            f"envs={num_envs}, agents={num_agents}":
                push_random_data_and_increment_timer(
                    num_runs,
                    num_envs,
                    num_agents,
                    source_code
                )
        }
    )

print(f"Times for {num_runs} function calls")
print("*" * 40)

for key, value in times.items():
    print(f"{key:30}: mean data push times: {value['data push times']:10.5}s,\t mean increment times: {value['code run time']:10.5}s")

'''
print(cuda_data_manager._meta_info)
print(cuda_data_manager._host_data)
print(cuda_data_manager._device_data_pointer)
print(cuda_data_manager._scalar_data_list)
print(cuda_data_manager._reset_data_list)
print(cuda_data_manager._log_data_list)
print(cuda_data_manager._device_data_via_torch)
print(cuda_data_manager._shared_constants)
print(cuda_data_manager._shape)
print(cuda_data_manager._dtype)
print(tensor_on_device)
time.sleep(300)
'''

Curiously, it does not matter how much memory is allocated before the CUDA function initialization: any allocation at all, however small, is enough to make the error go away. That is the puzzling part of this issue.

For example, the following code:

import numpy as np

from warp_drive.managers.data_manager import CUDADataManager
from warp_drive.managers.function_manager import (
    CUDAFunctionManager, CUDALogController, CUDASampler, CUDAEnvironmentReset
)
from warp_drive.utils.data_feed import DataFeed

source_code = """
// A function to demonstrate how to manipulate data on the GPU.
// This function increments each entry of the random data array we pushed to the GPU before.
// Each index corresponding to (env_id, agent_id) in the array is incremented by "agent_id + env_id".
// Everything inside the if() block runs in parallel for each agent and environment.
//
extern "C"{
    __global__ void cuda_increment(
        float* data,
        int num_agents
    )
    {
        int env_id = blockIdx.x;
        int agent_id = threadIdx.x;
        if (agent_id < num_agents){
            int array_index = env_id * num_agents + agent_id;
            int increment = env_id + agent_id;
            data[array_index] += increment;
        }
    }
}
"""

from timeit import Timer


def push_random_data_and_increment_timer(
    num_runs=1,
    num_envs=2,
    num_agents=3,
    source_code=None
):
    assert source_code is not None

    def push_random_data(num_agents, num_envs):
        # Initialize the CUDA data manager
        cuda_data_manager = CUDADataManager(
            num_agents=num_agents,
            num_envs=num_envs,
            episode_length=100
        )

        # Create random data
        random_data = np.random.rand(num_envs, num_agents)

        # Push data from host to device
        data_feed = DataFeed()
        data_feed.add_data(
            name="random_data",
            data=random_data,
        )
        data_feed.add_data(
            name="num_agents",
            data=num_agents
        )
        cuda_data_manager.push_data_to_device(data_feed)

        return cuda_data_manager

    # Initialize the CUDA function manager
    def cuda_func_init():
        cuda_function_manager = CUDAFunctionManager(
            num_agents=num_agents,  # cuda_data_manager.meta_info("n_agents"),
            num_envs=num_envs       # cuda_data_manager.meta_info("n_envs")
        )

        # Load source code and initialize function
        cuda_function_manager.load_cuda_from_source_code(
            source_code,
            default_functions_included=False
        )
        cuda_function_manager.initialize_functions(["cuda_increment"])
        increment_function = cuda_function_manager._get_function("cuda_increment")

        return cuda_function_manager, increment_function

    def increment_data(cuda_data_manager, cuda_function_manager, increment_function):
        increment_function(
            cuda_data_manager.device_data("random_data"),
            cuda_data_manager.device_data("num_agents"),
            block=cuda_function_manager.block,
            grid=cuda_function_manager.grid
        )

    # A tiny CUDA memory allocation first (any size is enough) ...
    push_random_data(1, 1)

    # ... then the CUDA function initialization ...
    cuda_function_manager, increment_function = cuda_func_init()    ###

    # ... then the actual data for this scenario is pushed
    cuda_data_manager = push_random_data(num_agents, num_envs)      ###

    program_run_time = Timer(lambda: increment_data(cuda_data_manager, cuda_function_manager, increment_function)).timeit(number=num_runs)

    print(cuda_data_manager.pull_data_from_device('random_data'))

    return {
        "data push times": '0',  # data_push_time
        "code run time": program_run_time
    }


num_runs = 1000
times = {}

for scenario in [
    (1, 1),
    (1, 100),
    (1, 1000),
    (100, 1000),
    (1000, 1000)
]:
    num_envs, num_agents = scenario
    times.update(
        {
            f"envs={num_envs}, agents={num_agents}":
                push_random_data_and_increment_timer(
                    num_runs,
                    num_envs,
                    num_agents,
                    source_code
                )
        }
    )

print(f"Times for {num_runs} function calls")
print("*" * 40)

for key, value in times.items():
    print(f"{key:30}: mean data push times: {value['data push times']:10.5}s,\t mean increment times: {value['code run time']:10.5}s")

'''
print(cuda_data_manager._meta_info)
print(cuda_data_manager._host_data)
print(cuda_data_manager._device_data_pointer)
print(cuda_data_manager._scalar_data_list)
print(cuda_data_manager._reset_data_list)
print(cuda_data_manager._log_data_list)
print(cuda_data_manager._device_data_via_torch)
print(cuda_data_manager._shared_constants)
print(cuda_data_manager._shape)
print(cuda_data_manager._dtype)
print(tensor_on_device)
time.sleep(300)
'''

The core code:

push_random_data(1, 1)

cuda_function_manager, increment_function = cuda_func_init()    ###
cuda_data_manager = push_random_data(num_agents, num_envs)      ###
program_run_time = Timer(lambda: increment_data(cuda_data_manager, cuda_function_manager, increment_function)).timeit(number=num_runs)

The line below is the CUDA memory allocation. Even a very small allocation is enough for the subsequent CUDA function initialization to run normally; if there is no CUDA memory allocation of any kind before the function initialization, the error appears.

push_random_data(1, 1)
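
In plain pycuda terms, this warm-up amounts to "touch some device memory, of any size, before loading the kernels". The snippet below only illustrates that shape under a plain pycuda setup; it is not WarpDrive code, and the 4-byte size is arbitrary:

import pycuda.autoinit            # creates a CUDA context on import
import pycuda.driver as drv

# Tiny throwaway device allocation, analogous to push_random_data(1, 1):
# the point is only that device memory has been touched before any
# kernels are compiled or initialized.
_warmup = drv.mem_alloc(4)

# ... it is now safe to build and initialize the CUDA functions,
# analogous to calling cuda_func_init() afterwards.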

Initializing the CUDA memory (the device memory allocation):

    push_random_data(1, 1)

Initializing the CUDA functions:
cuda_function_manager, increment_function = cuda_func_init() ###

Executing the CUDA kernel:

increment_data(cuda_data_manager, cuda_function_manager, increment_function)
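
For reference, this launch uses one CUDA block per environment and one thread per agent (blockIdx.x = env_id, threadIdx.x = agent_id); the actual block and grid values come from cuda_function_manager.block and cuda_function_manager.grid. Below is a small standalone pycuda sketch of the same increment kernel, written in the order recommended above (device memory first, then function initialization, then the launch). It does not use WarpDrive at all, and the explicit block/grid mapping is my assumption:

import numpy as np
import pycuda.autoinit                     # establishes a CUDA context on import
import pycuda.driver as drv
from pycuda.compiler import SourceModule

kernel_source = """
extern "C" {
__global__ void cuda_increment(float *data, int num_agents)
{
    int env_id = blockIdx.x;
    int agent_id = threadIdx.x;
    if (agent_id < num_agents) {
        data[env_id * num_agents + agent_id] += env_id + agent_id;
    }
}
}
"""

num_envs, num_agents = 2, 3

# Device memory first: push the random data to the GPU.
random_data = np.random.rand(num_envs, num_agents).astype(np.float32)
data_gpu = drv.mem_alloc(random_data.nbytes)
drv.memcpy_htod(data_gpu, random_data)

# Then compile the source and fetch the kernel handle.
module = SourceModule(kernel_source, no_extern_c=True)
increment = module.get_function("cuda_increment")

# Assumed mapping: one block per environment, one thread per agent.
increment(
    data_gpu,
    np.int32(num_agents),
    block=(num_agents, 1, 1),
    grid=(num_envs, 1),
)

# Copy the result back: each entry should have grown by env_id + agent_id.
result = np.empty_like(random_data)
drv.memcpy_dtoh(result, data_gpu)
print(result - random_data)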
