[Heterogeneous Computing] OpenCL Matrix Transpose
Introduction
The key to an efficient matrix transpose is making good use of local memory: stage the data in a local tile so that reads and writes to global memory are coalesced as far as possible, while keeping accesses to local memory cheap.
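To make that idea concrete before the full samples, here is a minimal sketch (not taken from the code below; the kernel names naive_transpose / tiled_transpose, the square-matrix layout and the fixed 16x16 tile are illustrative assumptions). The naive version leaves one of its two global accesses strided, while the tiled version stages a block in local memory so that both the global read and the global write touch consecutive addresses:

// Minimal sketch (square N x N matrix, row-major). The naive version makes
// one of its two global accesses strided, so it cannot be fully coalesced.
__kernel void naive_transpose(__global float* out,
                              __global const float* in,
                              const uint n)
{
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    out[y * n + x] = in[x * n + y];   // contiguous write, strided read
}

// Tiled version: a TILE x TILE block is staged in local memory so that both
// the global read and the global write are coalesced; the element swap
// happens inside the work-group's tile. Assumes the local work size is
// TILE x TILE and that n is a multiple of TILE.
#define TILE 16
__kernel void tiled_transpose(__global float* out,
                              __global const float* in,
                              const uint n)
{
    __local float tile[TILE][TILE];
    uint gx = get_global_id(0), gy = get_global_id(1);
    uint lx = get_local_id(0),  ly = get_local_id(1);

    tile[ly][lx] = in[gy * n + gx];            // coalesced read
    barrier(CLK_LOCAL_MEM_FENCE);

    uint ox = get_group_id(1) * TILE + lx;     // swapped block coordinates
    uint oy = get_group_id(0) * TILE + ly;
    out[oy * n + ox] = tile[lx][ly];           // coalesced write
}

Padding the tile to TILE x (TILE + 1) is a common further tweak to keep the tile[lx][ly] reads free of local-memory bank conflicts; the samples below use the unpadded layout.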
Complete code, version 1:
main.cpp:
#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <time.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

// Comment this out to build the matrix-transpose path instead of the multiply path.
#define MATRIXMULLTIPLY
#define N 6
#define K 8
#define L 5

// Function to check and handle OpenCL errors
inline void checkErr(cl_int err, const char *name)
{
    if (err != CL_SUCCESS)
    {
        std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl;
        exit(EXIT_FAILURE);
    }
}
cl_context CreateContext()
{
    cl_int errNum;
    cl_uint numPlatforms;
    cl_platform_id firstPlatformId;
    cl_context context = NULL;

    // First, select an OpenCL platform to run on. For this example, we simply
    // choose the first available platform. Normally, you would query for all
    // available platforms and select the most appropriate one.
    errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms <= 0)
    {
        std::cerr << "Failed to find any OpenCL platforms." << std::endl;
        return NULL;
    }

    // Next, create an OpenCL context on the platform. Attempt to create a
    // GPU-based context, and if that fails, try to create a CPU-based context.
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM, (cl_context_properties)firstPlatformId, 0
    };
    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cout << "Could not create GPU context, trying CPU..." << std::endl;
        context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU, NULL, NULL, &errNum);
        if (errNum != CL_SUCCESS)
        {
            std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
            return NULL;
        }
    }

    return context;
}

int main(int argc, char *argv[])
{
    // Use the first platform
    cl_int errNum;
    cl_platform_id platformID;
    cl_context context = NULL;
    cl_device_id device;

    errNum = clGetPlatformIDs(1, &platformID, NULL);
    checkErr(errNum, "clGetPlatformIDs");
    std::cout << "Platform ID: " << platformID << std::endl;

    context = CreateContext();
    if (context == NULL)
    {
        std::cerr << "Failed to create OpenCL context." << std::endl;
        return 1;
    }

    errNum = clGetDeviceIDs(platformID, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Could not get a CL_DEVICE_TYPE_GPU device, trying CL_DEVICE_TYPE_CPU..." << std::endl;
        errNum = clGetDeviceIDs(platformID, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
        std::cout << "Device: " << device << std::endl;
        if (errNum != CL_SUCCESS)
        {
            checkErr(errNum, "clGetDeviceIDs(..CL_DEVICE_TYPE_CPU..)");
        }
    }

    // Enable profiling so the clGetEventProfilingInfo calls below are valid.
    cl_command_queue commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &errNum);
    checkErr(errNum, "clCreateCommandQueue()");

    cl_int Mat_A_width  = N;
    cl_int Mat_A_height = K;
    cl_int Mat_B_width  = K;
    cl_int Mat_B_height = L;

    float *MatA = (float*)malloc(sizeof(float) * Mat_A_width * Mat_A_height);
    if (MatA == NULL)
    {
        std::cerr << "Failed to allocate memory." << std::endl;
        return 1;
    }

#ifdef MATRIXMULLTIPLY
    float *MatB = (float*)malloc(sizeof(float) * Mat_B_width * Mat_B_height);
    float *MatC = (float*)malloc(sizeof(float) * Mat_A_width * Mat_B_height);
#else
    float *MatC = (float*)malloc(sizeof(float) * Mat_A_width * Mat_A_height);
#endif

    std::cout << "=====MatA: " << Mat_A_width << "X" << Mat_A_height;
    for (int i = 0; i < Mat_A_width * Mat_A_height; i++)
    {
        MatA[i] = std::rand() * 0.25;
        //MatA[i] = 4.5;
        if ((i % Mat_A_height == 0) || (i == 0))
        {
            std::cout << std::endl;
        }
        std::cout << MatA[i] << "\t";
    }
    std::cout << std::endl;

    // Allocate space for Matrix A on the device
    cl_mem bufferA = clCreateBuffer(context,
                                    CL_MEM_READ_ONLY, //|CL_MEM_COPY_HOST_PTR,
                                    Mat_A_width * Mat_A_height * sizeof(float),
                                    NULL,
                                    &errNum);
    checkErr(errNum, "clCreateBuffer(...bufferA..)");

    // Copy Matrix A to the device
    errNum = clEnqueueWriteBuffer(commandQueue, bufferA, CL_TRUE, 0,
                                  Mat_A_width * Mat_A_height * sizeof(float),
                                  (void*)MatA, 0, NULL, NULL);

#ifdef MATRIXMULLTIPLY
std::cout<<"MatB: "<<Mat_B_width <<"X"<<Mat_B_height<<std::endl;
for(int i = 0; i< Mat_B_width*Mat_B_height; i++)
{
MatB[i] = std::rand()*0.25;
//MatB[i] = 2.0;
if((i%Mat_B_height ==0)||(i == 0))
{
std::cout << std::endl;
}
std::cout<<MatA[i]<< " ";
}
std::cout<<std::endl;
//Allocate space for Matrix B on the device
cl_mem bufferB = clCreateBuffer(context,
CL_MEM_READ_ONLY,//|CL_MEM_COPY_HOST_PTR,
Mat_B_width*Mat_B_height*sizeof(float),
NULL,
&errNum);
checkErr(errNum,"clCreateBuffer(...bufferB..)"); //Copy Matrix B to the device
errNum = clEnqueueWriteBuffer(commandQueue,bufferB,CL_TRUE, 0,Mat_B_width*Mat_B_height*sizeof(float),(void*)MatB,0,NULL,NULL); //Allocate space for Matrix C on the device
cl_mem bufferC = clCreateBuffer(context,
CL_MEM_READ_ONLY,//|CL_MEM_COPY_HOST_PTR,
Mat_A_width*Mat_B_height*sizeof(float),
NULL,
&errNum);
checkErr(errNum,"clCreateBuffer(...bufferC..)");
#else
//Allocate space for Matrix C on the device
cl_mem bufferC = clCreateBuffer(context,
CL_MEM_READ_ONLY,//|CL_MEM_COPY_HOST_PTR,
Mat_A_width*Mat_A_height*sizeof(float),
NULL,
&errNum);
checkErr(errNum,"clCreateBuffer(...bufferC..)");
#endif

    // Read the program source from Matrixkernel.cl and build it
    cl_program program;
    const char *fileName = "Matrixkernel.cl";
    std::ifstream kernelFile(fileName, std::ios::in);
    if (!kernelFile.is_open())
    {
        std::cerr << "Failed to open file for reading: " << fileName << std::endl;
        return 1;
    }

    std::ostringstream oss;
    oss << kernelFile.rdbuf();
    std::string srcStdStr = oss.str();
    const char *srcStr = srcStdStr.c_str();

    program = clCreateProgramWithSource(context, 1, (const char**)&srcStr, NULL, NULL);
    if (program == NULL)
    {
        std::cerr << "Failed to create OpenCL program from source." << std::endl;
        return 1;
    }

    errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL);
        std::cerr << "Error in kernel: " << std::endl;
        std::cerr << buildLog;
        clReleaseProgram(program);
        return 1;
    }
#ifdef MATRIXMULLTIPLY
    // Create the matrix-multiplication kernel
    cl_kernel kernel = clCreateKernel(program, "MulltiplySample", NULL);
    if (kernel == NULL)
    {
        std::cerr << "Failed to create kernel." << std::endl;
        return 1;
    }

    // Set the kernel arguments
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&bufferC);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&Mat_A_width);
    clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&Mat_A_height);
    clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&Mat_B_width);
    clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&Mat_B_height);
    clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&bufferA);
    clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&bufferB);

    // Set local and global work-group sizes
    size_t globalws[2] = { (size_t)Mat_A_width, (size_t)Mat_B_height };
    size_t localws[2]  = { (size_t)Mat_A_width, (size_t)Mat_B_height };

    // Execute the kernel
    errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalws, localws, 0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Failed to execute the kernel." << std::endl;
        return 1;
    }

    // Read the result back to the host
    errNum = clEnqueueReadBuffer(commandQueue, bufferC, CL_TRUE, 0,
                                 Mat_B_height * Mat_A_width * sizeof(float),
                                 (void*)MatC, 0, NULL, NULL);

    std::cout << "MatrixC: " << Mat_A_width << "X" << Mat_B_height << std::endl;
    for (int i = 0; i < Mat_A_width * Mat_B_height; i++)
    {
        if ((i != 0) && (i % Mat_B_height == 0))
        {
            std::cout << std::endl;
        }
        std::cout << MatC[i] << "\t";
    }
    std::cout << std::endl;
    clReleaseKernel(kernel);
#else
    cl_kernel Trapsposekernel;
    cl_int blockSize = 16;

    if (Mat_A_width * Mat_A_height > 1000)
    {
        // Large matrices: use the tiled, local-memory version
        Trapsposekernel = clCreateKernel(program, "MatrixTranspose", NULL);
        std::cout << "CreateKernel: MatrixTranspose" << std::endl;
        if (Trapsposekernel == NULL)
        {
            std::cerr << "Failed to create transpose kernel." << std::endl;
            return 1;
        }
        clSetKernelArg(Trapsposekernel, 0, sizeof(cl_mem), (void*)&bufferC);
        clSetKernelArg(Trapsposekernel, 1, sizeof(cl_mem), (void*)&bufferA);
        clSetKernelArg(Trapsposekernel, 2, sizeof(cl_float) * blockSize * blockSize, NULL); // local memory tile
        clSetKernelArg(Trapsposekernel, 3, sizeof(cl_int), (void*)&Mat_A_width);
        clSetKernelArg(Trapsposekernel, 4, sizeof(cl_int), (void*)&Mat_A_height);
        clSetKernelArg(Trapsposekernel, 5, sizeof(cl_int), (void*)&blockSize);
    }
    else
    {
        // Small matrices: use the naive one-element-per-work-item version
        Trapsposekernel = clCreateKernel(program, "TrapsposeMatrixSample", NULL);
        std::cout << "CreateKernel: TrapsposeMatrixSample" << std::endl;
        if (Trapsposekernel == NULL)
        {
            std::cerr << "Failed to create transpose kernel." << std::endl;
            return 1;
        }
        clSetKernelArg(Trapsposekernel, 0, sizeof(cl_mem), (void*)&bufferC);
        clSetKernelArg(Trapsposekernel, 1, sizeof(cl_int), (void*)&Mat_A_width);
        clSetKernelArg(Trapsposekernel, 2, sizeof(cl_int), (void*)&Mat_A_height);
        clSetKernelArg(Trapsposekernel, 3, sizeof(cl_mem), (void*)&bufferA);
    }

    // Note: for the MatrixTranspose path the local work size must be
    // {blockSize, blockSize}; the sizes below only suit the small, naive path.
    size_t localtr[2]  = { (size_t)Mat_A_height, (size_t)Mat_A_width };
    size_t globaltr[2] = { (size_t)Mat_A_height, (size_t)Mat_A_width };

    cl_event dev;
    // Queue the kernel up for execution across the whole matrix
    errNum = clEnqueueNDRangeKernel(commandQueue, Trapsposekernel, 2, NULL, globaltr, localtr, 0, NULL, &dev);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Failed to execute the kernel." << std::endl;
        return 1;
    }
    std::cout << "CommandQueue: " << commandQueue << std::endl;
    clFinish(commandQueue);

    cl_ulong startTime, endTime;
    clGetEventProfilingInfo(dev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
    clGetEventProfilingInfo(dev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    cl_ulong kernelExecTimeNs = endTime - startTime;
    printf("transpose kernel exec time: %8.6f ms\n", kernelExecTimeNs * 1e-6);

    errNum = clEnqueueReadBuffer(commandQueue, bufferC, CL_TRUE, 0,
                                 Mat_A_width * Mat_A_height * sizeof(float),
                                 (void*)MatC, 0, NULL, NULL);

    std::cout << "====Transpose of MatrixA: " << Mat_A_height << "X" << Mat_A_width << std::endl;
    for (int i = 0; i < Mat_A_width * Mat_A_height; i++)
    {
        if ((i != 0) && (i % Mat_A_width == 0))
        {
            std::cout << std::endl;
        }
        std::cout << MatC[i] << "\t";
    }
    std::cout << std::endl;
    clReleaseKernel(Trapsposekernel);
#endif

    clReleaseProgram(program);
    clReleaseCommandQueue(commandQueue);
    clReleaseContext(context);

    // The matrices were allocated with malloc, so release them with free
    free(MatA);
#ifdef MATRIXMULLTIPLY
    free(MatB);
#endif
    free(MatC);
    return 0;
}
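The sample fixes blockSize at 16; whether a 16x16 work-group and a 16x16 float tile actually fit the device is hardware-dependent. Here is a hedged host-side sketch of the kind of check one might add (it reuses the device and Trapsposekernel handles from main() above; the fallback loop is illustrative, not part of the original code):

// Sketch: validate a candidate blockSize against the device and kernel limits.
size_t maxWorkGroupSize = 0;
cl_ulong localMemSize = 0;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
                sizeof(localMemSize), &localMemSize, NULL);

size_t kernelWorkGroupSize = 0;
clGetKernelWorkGroupInfo(Trapsposekernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                         sizeof(kernelWorkGroupSize), &kernelWorkGroupSize, NULL);

cl_int blockSize = 16;
while (blockSize > 1 &&
       ((size_t)(blockSize * blockSize) > maxWorkGroupSize ||
        (size_t)(blockSize * blockSize) > kernelWorkGroupSize ||
        (cl_ulong)(blockSize * blockSize) * sizeof(cl_float) > localMemSize))
{
    blockSize /= 2;   // fall back to a smaller tile if the device is too small
}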
Kernel code (Matrixkernel.cl):
/*
 * @param outputC output matrix C
 * @param widthA  width  of matrix A
 * @param heightA height of matrix A
 * @param widthB  width  of matrix B
 * @param heightB height of matrix B
 * @param inputA  input matrix A
 * @param inputB  input matrix B
 */
__kernel void MulltiplySample(__global float* outputC,
                              const int widthA,
                              const int heightA,
                              const int widthB,
                              const int heightB,
                              __global float* inputA,
                              __global float* inputB)
{
    int row = get_global_id(1);  // global position in Y direction
    int col = get_global_id(0);  // global position in X direction

    float sum = 0.0f;
    // Calculate one element of matrix C
    for (int i = 0; i < widthA; i++)
    {
        sum += inputA[row * widthA + i] * inputB[i * widthB + col];
    }
    outputC[row * widthB + col] = sum;
}

/*
 * @param TrapsposeMatrix output (transposed) matrix
 * @param width  width  of the input matrix
 * @param height height of the input matrix
 * @param InputMatrix input matrix
 */
__kernel void TrapsposeMatrixSample(__global float* TrapsposeMatrix,
                                    const uint width,
                                    const uint height,
                                    __global float* InputMatrix)
{
    int row = get_global_id(0);
    int col = get_global_id(1);

    TrapsposeMatrix[row * width + col] = InputMatrix[col * height + row];
}

/*
 * Copies a block to local memory and copies back the transpose
 * from local memory to the output.
 * @param output output matrix
 * @param input  input matrix
 * @param block  local memory of size blockSize x blockSize
 * @param width  width  of the input matrix
 * @param height height of the input matrix
 * @param blockSize size of the block
 */
__kernel void MatrixTranspose(__global float * output,
                              __global float * input,
                              __local  float * block,
                              const uint width,
                              const uint height,
                              const uint blockSize)
{
    uint globalIdx = get_global_id(0);
    uint globalIdy = get_global_id(1);

    uint localIdx = get_local_id(0);
    uint localIdy = get_local_id(1);

    /* copy from input to local memory */
    block[localIdy * blockSize + localIdx] = input[globalIdy * width + globalIdx];

    /* wait until the whole block is filled */
    barrier(CLK_LOCAL_MEM_FENCE);

    uint groupIdx = get_group_id(0);
    uint groupIdy = get_group_id(1);

    /* calculate the corresponding target location for the transpose
       by swapping the x and y block coordinates */
    uint targetGlobalIdx = groupIdy * blockSize + localIdy;
    uint targetGlobalIdy = groupIdx * blockSize + localIdx;

    /* calculate the corresponding raster indices of target and source */
    uint targetIndex = targetGlobalIdy * height + targetGlobalIdx;
    uint sourceIndex = localIdy * blockSize + localIdx;

    output[targetIndex] = block[sourceIndex];
}
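One subtlety when launching MatrixTranspose: the kernel indexes its local tile directly with the local IDs, so it only works if each work-group is exactly blockSize x blockSize work-items and the matrix dimensions are multiples of blockSize. A hedged sketch of a launch configuration consistent with that assumption (width, height and the surrounding handles are illustrative; the listing above uses different work sizes for its small-matrix path):

// Sketch: launch configuration for the tiled MatrixTranspose kernel.
// Assumes width and height are multiples of blockSize (pad the matrix otherwise).
size_t blockSize = 16;
size_t localws[2]  = { blockSize, blockSize };
size_t globalws[2] = { (size_t)width, (size_t)height };   // one work-item per element

cl_event evt;
cl_int err = clEnqueueNDRangeKernel(commandQueue, Trapsposekernel,
                                    2,        // work dimensions
                                    NULL,     // no global offset
                                    globalws, localws,
                                    0, NULL, &evt);
checkErr(err, "clEnqueueNDRangeKernel(MatrixTranspose)");
clWaitForEvents(1, &evt);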
Test output:
Complete code, version 2:
main.cpp:
// Matrix.cpp : Defines the entry point for the console application.
#include "stdafx.h"
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <fstream>

using namespace std;

#pragma comment (lib, "OpenCL.lib")

#define M 2048

// Read the whole kernel file into a string; returns 0 on success, 1 on failure.
int convertToString(const char *filename, std::string& s)
{
    size_t size;
    char* str;
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));
    if (f.is_open())
    {
        size_t fileSize;
        f.seekg(0, std::fstream::end);
        size = fileSize = (size_t)f.tellg();
        f.seekg(0, std::fstream::beg);

        str = new char[size + 1];
        if (!str)
        {
            f.close();
            return 1;
        }
        f.read(str, fileSize);
        f.close();
        str[size] = '\0';

        s = str;
        delete[] str;
        return 0;
    }
    printf("Error: Failed to open file %s\n", filename);
    return 1;
}

int main(int argc, char* argv[])
{
    float *src1 = 0;
    float *src2 = 0;
    src1 = (float*)malloc(M * M * sizeof(float));
    src2 = (float*)malloc(M * M * sizeof(float));

    // Fill src1 with random data and compute the reference transpose src2 on the host
    int i, j;
    srand((unsigned)time(NULL));
    for (i = 0; i < M * M; i++)
        src1[i] = (float)(rand() % 50);
    for (i = 0; i < M; i++)
    {
        for (j = 0; j < M; j++)
        {
            src2[i * M + j] = src1[j * M + i];
        }
    }

    // Set up platform, device, context and a profiling-enabled command queue
    cl_int status;
    cl_platform_id platform;
    status = clGetPlatformIDs(1, &platform, NULL);

    cl_device_id device;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);

    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
    cl_command_queue queue = clCreateCommandQueue(context, device,
                                                  CL_QUEUE_PROFILING_ENABLE, NULL);

    // Input buffer (initialized from src1) and output buffer
    cl_mem clsrc1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   M * M * sizeof(cl_float), src1, NULL);
    cl_mem clsrc2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                   M * M * sizeof(cl_float), NULL, NULL);

    // Build the program from transpose.cl
    const char *filename = "transpose.cl";
    std::string sourceStr;
    status = convertToString(filename, sourceStr);
    const char *source = sourceStr.c_str();
    size_t sourceSize[] = { strlen(source) };
    cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);

    status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (status != 0)
    {
        printf("clBuildProgram failed: %d\n", status);
        char tbuf[0x10000];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0x10000, tbuf, NULL);
        printf("\n%s\n", tbuf);
        return -1;
    }

    // --- Simple (naive) transpose kernel ---
    cl_kernel kernel = clCreateKernel(program, "matrixTransposeSimple", NULL);
    cl_int dimx = M;
    cl_int dimy = M;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&clsrc2);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&clsrc1);
    clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&dimx);
    clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&dimy);

    // Set local and global work-group sizes
    size_t localws[2] = { 16, 16 };
    size_t globalws[2] = { M, M };

    cl_event ev;
    clEnqueueNDRangeKernel(queue, kernel, 2, 0, globalws, localws, 0, NULL, &ev);
    clFinish(queue);

    cl_ulong startTime, endTime;
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    cl_ulong kernelExecTimeNs = endTime - startTime;
    printf("simple kernel exec time: %8.6f ms\n", kernelExecTimeNs * 1e-6);

    // Copy results from device back to host and compare against the reference
    float *op_data = 0;
    op_data = (cl_float *)clEnqueueMapBuffer(queue, clsrc2, CL_TRUE, CL_MAP_READ, 0,
                                             M * M * sizeof(cl_float), 0, NULL, NULL, NULL);
    for (i = 0; i < M * M; i++)
    {
        if (fabs(src2[i] - op_data[i]) > 0.0001)
        {
            printf("check failed\n");
            break;
        }
    }
    if (i == M * M)
        printf("check passed\n");
    clEnqueueUnmapMemObject(queue, clsrc2, op_data, 0, NULL, NULL);

    // --- Tiled transpose kernel using local memory ---
    cl_uint blockSize = 16;
    kernel = clCreateKernel(program, "matrixTranspose", NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&clsrc2);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&clsrc1);
    clSetKernelArg(kernel, 2, sizeof(cl_float) * blockSize * blockSize, NULL); // local memory tile
    clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&dimx);
    clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&dimy);
    clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&blockSize);

    clEnqueueNDRangeKernel(queue, kernel, 2, 0, globalws, localws, 0, NULL, &ev);
    clFinish(queue);

    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
    kernelExecTimeNs = endTime - startTime;
    printf("tiled kernel exec time: %8.6f ms\n", kernelExecTimeNs * 1e-6);

    // Copy results from device back to host and compare against the reference
    op_data = (cl_float *)clEnqueueMapBuffer(queue, clsrc2, CL_TRUE, CL_MAP_READ, 0,
                                             M * M * sizeof(cl_float), 0, NULL, NULL, NULL);
    for (i = 0; i < M * M; i++)
    {
        if (fabs(src2[i] - op_data[i]) > 0.0001)
        {
            printf("check failed\n");
            break;
        }
    }
    if (i == M * M)
        printf("check passed\n");
    clEnqueueUnmapMemObject(queue, clsrc2, op_data, 0, NULL, NULL);

    // Clean up
    if (src1)
        free(src1);
    if (src2)
        free(src2);
    clReleaseKernel(kernel);
    clReleaseMemObject(clsrc1);
    clReleaseMemObject(clsrc2);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 0;
}
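Note that this version reads the result back with clEnqueueMapBuffer rather than the clEnqueueReadBuffer used in the first example; a mapped pointer should eventually be released with clEnqueueUnmapMemObject. For comparison, the explicit blocking copy would look roughly like this (a hedged sketch reusing queue, clsrc2 and M from the listing above; host_result is an illustrative name):

// Alternative to map/unmap: an explicit blocking copy of the result into host memory.
float *host_result = (float*)malloc(M * M * sizeof(float));
cl_int err = clEnqueueReadBuffer(queue, clsrc2, CL_TRUE /* blocking */, 0,
                                 M * M * sizeof(cl_float), host_result,
                                 0, NULL, NULL);
if (err != CL_SUCCESS)
    printf("clEnqueueReadBuffer failed: %d\n", err);
/* ... use host_result ... */
free(host_result);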
Kernel code (transpose.cl):
/*
 * Copies a block to local memory and copies back the transpose
 * from local memory to the output.
 * @param output output matrix
 * @param input  input matrix
 * @param block  local memory of size blockSize x blockSize
 * @param width  width  of the input matrix
 * @param height height of the input matrix
 * @param blockSize size of the block
 */
__kernel
void matrixTranspose(__global float * output,
                     __global float * input,
                     __local  float * block,
                     const uint width,
                     const uint height,
                     const uint blockSize)
{
    uint globalIdx = get_global_id(0);
    uint globalIdy = get_global_id(1);

    uint localIdx = get_local_id(0);
    uint localIdy = get_local_id(1);

    /* copy from input to local memory */
    block[localIdy * blockSize + localIdx] = input[globalIdy * width + globalIdx];

    /* wait until the whole block is filled */
    barrier(CLK_LOCAL_MEM_FENCE);

    uint groupIdx = get_group_id(0);
    uint groupIdy = get_group_id(1);

    /* calculate the corresponding target location for the transpose
       by swapping the x and y block coordinates */
    uint targetGlobalIdx = groupIdy * blockSize + localIdy;
    uint targetGlobalIdy = groupIdx * blockSize + localIdx;

    /* calculate the corresponding raster indices of target and source */
    uint targetIndex = targetGlobalIdy * height + targetGlobalIdx;
    uint sourceIndex = localIdy * blockSize + localIdx;

    output[targetIndex] = block[sourceIndex];
}

__kernel
void matrixTransposeSimple(__global float * output,
                           __global float * input,
                           const uint width,
                           const uint height)
{
    uint gdx = get_global_id(0);
    uint gdy = get_global_id(1);

    /* the read below is strided, so it cannot be coalesced */
    output[gdy * width + gdx] = input[gdx * height + gdy];
}
Test output:
========================================================
Please credit the source when reposting: http://blog.csdn.net/songzitea/article/details/12178619
========================================================