Parallel Computing–Cannon算法 (MPI 实现)

原理不解释，直接上代码

代码中被注释的源程序可用于打印中间结果，检查运算是否正确。

#include "mpi.h"

#include <math.h>

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

void scatter_matrix(int* fstream,int n1,int n2,int*Q,int root,int tag){

	/*每个矩阵块的大小*/

	int rows=(n1+root-1)/root;

	int cols=(n2+root-1)/root;

	int* tmp_matrix=(int*)malloc(rows*cols*sizeof(int));

	int i,j;

	memset(Q,0,rows*cols*sizeof(int));

	for(i=0;i<root;i++){

		for(j=0;j<root;j++){

			int p=0,q=0;

			int imin=i*rows*n2;

			int jmin=j*cols;

			memset(tmp_matrix,0,sizeof(tmp_matrix));

			/*在划分矩阵时，由于地空间不连续，需要另开辟一个数组连续的保存起来，以便于调用MPI_Send*/

			for(p=0;p<rows;p++,imin+=n2){

				for(q=0;q<cols;q++){

					tmp_matrix[p*cols+q]=fstream[imin+jmin+q];

				}

			}

			if(i==0&&j==0){

				/*进程0 不需要使用MPI_Send将数据发送给自己，直接使用memcpy将结果拷贝即可*/

				memcpy(Q,tmp_matrix,rows*cols*sizeof(int));

			}else{

				/*将分块发送给位于i行，j列的进程*/

				MPI_Send(tmp_matrix,rows*cols,MPI_INT,i*root+j,tag,MPI_COMM_WORLD);

			}

		}

	}

}

/*

 *@row:矩阵所在的行

 *@col:矩阵所在的列

 *@sp:sp=root=sqrt(nprocs)

 *@return 根据行列号计算进程实际编号

*/

int get_index(int row,int col,int sp){

	int tmp=((row+sp)%sp)*sp+(col+sp)%sp;

	return tmp;

}

/*计算矩阵乘法，将结果存入C中*/

void matrix_multi(int* A,int *B,int *C,int n1,int n2,int n3,int myid){

	int i=0,j=0,k=0;

	int* tmp_C=(int*)malloc(n1*n3*sizeof(int));

	memset(tmp_C,0,sizeof(int)*n1*n3);

	for(i=0;i<n1;i++){

		for(j=0;j<n3;j++){

			for(k=0;k<n2;k++){

				tmp_C[i*n3+j]+=A[i*n2+k]*B[k*n3+j];

			}

			C[i*n3+j]+=tmp_C[i*n3+j];

		}

	}

}

/*用于矩阵下标定位对齐*/

void shuffle(int*A,int*buf_A,int buf_A_size,int *B,int*buf_B,int buf_B_size,int root,int myid){

	int i,j;

	MPI_Status status;

	int cur_col=0;

	int cur_row=0;

	/*通过进程编号计算获得当前进程所在的行号和列号*/

	cur_row=myid/root;

	cur_col=myid-cur_row*root;

	/*对于矩阵A，第i行的矩阵需要向左平移i次*/

	for(i=0;i<cur_row;i++){

		/*接收来自右边的数据，并将当前矩阵发送给左边的进程*/

		MPI_Sendrecv(A,buf_A_size,MPI_INT,get_index(cur_row,cur_col-1,root),102,

			     buf_A,buf_A_size,MPI_INT,get_index(cur_row,cur_col+1,root),102,MPI_COMM_WORLD,&status);

		memcpy(A,buf_A,buf_A_size*sizeof(int));/*buf_A用于通信时缓存矩阵*/

		memset(buf_A,0,buf_A_size*sizeof(int));

	}

	/*对于矩阵B，第j列的矩阵需要向上平移j次*/

	for(j=0;j<cur_col;j++){

		/*接收来自下边的数据，并将当前矩阵发送给上边的进程*/

		MPI_Sendrecv(B,buf_B_size,MPI_INT,get_index(cur_row-1,cur_col,root),103,

			     buf_B,buf_B_size,MPI_INT,get_index(cur_row+1,cur_col,root),103,MPI_COMM_WORLD,&status);

		memcpy(B,buf_B,buf_B_size*sizeof(int));/*buf_B用于通信时缓存矩阵*/

		memset(buf_B,0,buf_B_size*sizeof(int));

	}

	/*printf("I have shuffled!\n");*/

}

void cannon(int*A,int*buf_A,int buf_A_size,int *B,int*buf_B,int buf_B_size,

	int *C,int buf_C_size,int row_a,int col_a,int col_b,int root,int myid){

	MPI_Status status;

	double elapsed_time,multiply_time=0,passdata_time=0;

	int i,j;

	memset(C,0,sizeof(int)*buf_C_size);

	int cur_col=0;

	int cur_row=0;

	/*通过进程编号计算获得当前进程所在的行号和列号*/

	cur_row=myid/root;

	cur_col=myid-cur_row*root;

	for(i=0;i<root;i++){/*一共需要循环root次，root=sqrt(nprocs)*/

		elapsed_time=MPI_Wtime();

		matrix_multi(A,B,C,row_a,col_a,col_b,myid);/*计算矩阵乘法*/

		elapsed_time=MPI_Wtime()-elapsed_time;

		multiply_time+=elapsed_time;

		/*elapsed_time=MPI_Wtime();		*/

		/*接收来自右边(row,col+1)的数据，并将当前矩阵发送给左边(row,col-1)的进程*/

		MPI_Sendrecv(A,buf_A_size,MPI_INT,get_index(cur_row,cur_col-1,root),102,

			     buf_A,buf_A_size,MPI_INT,get_index(cur_row,cur_col+1,root),102,MPI_COMM_WORLD,&status);

		/*接收来自下边(row+1,col)的数据，并将当前矩阵发送给上边(row-1,col)的进程*/

		MPI_Sendrecv(B,buf_B_size,MPI_INT,get_index(cur_row-1,cur_col,root),103,

			     buf_B,buf_B_size,MPI_INT,get_index(cur_row+1,cur_col,root),103,MPI_COMM_WORLD,&status);

		/*elapsed_time=MPI_Wtime()-elapsed_time;

		passdata_time+=elapsed_time;*/

		memcpy(B,buf_B,buf_B_size*sizeof(int));/*将buf_B中的数据拷贝至B中*/

		memcpy(A,buf_A,buf_A_size*sizeof(int));/*将buf_A中的数据拷贝至A中*/

	}

	/*将计算结果发送给数组C*/

	MPI_Send(C,row_a*col_b,MPI_INT,0,104,MPI_COMM_WORLD);

	printf("proc:%d, passdata time:%lf    multiply time:%lf\n",myid,passdata_time,multiply_time);

}

void gather_matrix(int *fstream,int n1,int n3,int*C,int root,FILE*fhc){

	MPI_Status status;

	int rows=(n1+root-1)/root;

	int cols=(n3+root-1)/root;

	int* tmp_matrix=(int*)malloc(rows*cols*sizeof(int));

	int i,j;

	for(i=0;i<root;i++){

		for(j=0;j<root;j++){

			int p,q;

			int imin=i*rows*n3;

			int jmin=j*cols;

			memset(tmp_matrix,0,sizeof(tmp_matrix));

			/*接收来自各个进程的数据*/

			MPI_Recv(tmp_matrix,rows*cols,MPI_INT,i*root+j,104,MPI_COMM_WORLD,&status);

			/*printf("I am passed proc:%d \n",i*root+j);*/

			/*将接收的矩阵tmp拼接到矩阵C中去，需要按照合理顺序拼接，否则结果会出错*/

			for(p=0;p<rows;p++,imin+=n3){

				for(q=0;q<cols;q++){

					fstream[imin+jmin+q]=tmp_matrix[p*cols+q];

					/*printf("%d ",((int*)fstream)[imin+jmin+q]);*/

				}

				/*printf("\n");*/

			}

		}

	}

	/*将结果打印到文件中*/

	for(i=0;i<n1;i++){

		for(j=0;j<n3;j++){

			fprintf(fhc,"%d ",fstream[i*n3+j]);

		}

		fprintf(fhc,"\n");

	}

}

int main(int argc,char**argv){

	int myid,numprocs;

	int i,j;

	MPI_Status status;

	int root=0;

	int dim[3];

	double elapsed_time=0;

	int max_rows_a,max_cols_a,max_rows_b,max_cols_b;

	int buf_A_size,buf_B_size,buf_C_size;

	FILE* fhc;

	/*suppose A:n1*n2 ,B:n2*n3;n1,n2,n3 are read from input file*/

	int n1,n2,n3;

	/*buffer for matrix A,B,C will be shifted ,so they each have two buffer*/

	int *A,*B,*C,*buf_A,*buf_B;

	/*on proc0,buffers to cache matrix files of A,B and C*/

	int *fstream_a=NULL,*fstream_b=NULL,*fstream_c=NULL;

	MPI_Init(&argc,&argv);/*初始化*/

	MPI_Comm_rank(MPI_COMM_WORLD,&myid);/*获取当前进程ID*/

	MPI_Comm_size(MPI_COMM_WORLD,&numprocs);/*获取全部进程数量*/

	root=sqrt(numprocs);

	if(numprocs!=root*root){

		/*如果进程总数不是平方数，则结束程序*/

		printf("process number must be a squre!\n");

		exit(-1);

	}

	/*on proc0,preprocess the command line,read in file

	 for A,B and put their sizes in dim[]*/

	if(myid==0){

		FILE *file_a,*file_b,*file_c;

		int n1,n2,n3;

		int i,j;

		file_a=fopen(argv[1],"r");/*打开文件a，文件名从运行时给的参数中获得*/

		file_b=fopen(argv[2],"r");/*打开文件b，文件名从运行时给的参数中获得*/

		fscanf(file_a,"%d %d",&n1,&n2);/*从文件a中读取矩阵A的行数，列数*/

		fscanf(file_b,"%d %d",&n2,&n3);/*从文件b中读取矩阵B的行数，列数*/

		dim[0]=n1,dim[1]=n2,dim[2]=n3;

		fstream_a=(int*)malloc(n1*n2*sizeof(int));/*分配一块内存，用于将矩阵A读入内存*/

		fstream_b=(int*)malloc(n2*n3*sizeof(int));/*分配一块内存，用于将矩阵B读入内存*/

		/*printf("Yeah! I got n1=%d,n2=%d,n3=%d\n",n1,n2,n3);*/

		/*读入矩阵A，保存在fstream_a中*/

		for(i=0;i<n1;i++)

			for(j=0;j<n2;j++)

			fscanf(file_a,"%d",&((int*)fstream_a)[i*n2+j]);

		/*读入矩阵B，保存在fstream_b中*/

		for(i=0;i<n2;i++)

			for(j=0;j<n3;j++)

				fscanf(file_b,"%d",&((int*)fstream_b)[i*n3+j]);

	}

	/*将矩阵的行数，列数通过Bcast广播给所有进程*/

	MPI_Bcast(dim,3,MPI_INT,0,MPI_COMM_WORLD);

	n1=dim[0],n2=dim[1],n3=dim[2];

	/*begin new version*/

	max_rows_a=(n1+root-1)/root;/*子矩阵块A的行数*/

	max_cols_a=(n2+root-1)/root;/*子矩阵块A的列数*/

	max_rows_b=max_cols_a;      /*子矩阵块B的行数*/

	max_cols_b=(n3+root-1)/root;/*子矩阵块B的列数*/

	buf_A_size=max_rows_a*max_cols_a;/*子矩阵块A的大小*/

	buf_B_size=max_rows_b*max_cols_b;/*子矩阵块B的大小*/

	buf_C_size=max_rows_a*max_cols_b;/*子矩阵块C的大小*/

	/*给A,，buf_A,buf_B,B,C分配内存空间，其中buf_A,buf_B用于通讯中的缓存*/

	A=(int*)malloc(sizeof(int)*buf_A_size);

	buf_A=(int*)malloc(sizeof(int)*buf_A_size);

	B=(int*)malloc(sizeof(int)*buf_B_size);

	buf_B=(int*)malloc(sizeof(int)*buf_B_size);

	C=(int*)malloc(sizeof(int)*buf_C_size);

	if(A==NULL||buf_A==NULL||B==NULL||buf_B==NULL||C==NULL)

	{

		/*如果内存申请失败，就退出*/

		printf("Memory allocation failed!\n");

		exit(-1);

	}

	/*proc 0 scatter A,B to other procs in a 2D block distribution fashion*/

	if(myid==0){

		/*printf("max_rows_a:%d\n",max_rows_a);

		printf("max_cols_a:%d\n",max_cols_a);

		printf("max_rows_b:%d\n",max_rows_b);

		printf("max_cols_b:%d\n",max_cols_b);*/

		/*进程0 将矩阵A，B划分成小块，分发给其他进程*/

		scatter_matrix((int*)fstream_a,n1,n2,A,root,100);

		/*printf("I am debuging!\n");*/

		scatter_matrix((int*)fstream_b,n2,n3,B,root,101);

		/*printf("I am finding fault!\n");*/

	}else{

		/*其他进程接收来自进程0 发送的矩阵A,B*/

		MPI_Recv(A,max_rows_a*max_cols_a,MPI_INT,0,100,MPI_COMM_WORLD,&status);

		MPI_Recv(B,max_rows_b*max_cols_b,MPI_INT,0,101,MPI_COMM_WORLD,&status);

	}

	MPI_Barrier(MPI_COMM_WORLD);/*等待全部进程完成数据接收工作。*/

	/*printf("I am proc %d\n",myid);

	for(i=0;i<max_rows_a;i++){

		printf("%d:      ",myid);

		for(j=0;j<max_cols_a;j++){

			printf("%d ",A[i*max_cols_a+j]);

		}

		printf("\n");

	}

	printf("I am proc %d\n",myid);

	for(i=0;i<max_rows_b;i++){

		printf("%d:      ",myid);

		for(j=0;j<max_cols_b;j++){

			printf("%d ",B[i*max_cols_b+j]);

		}

		printf("\n");

	}*/

	/*compute C=A*B by Cannon algorithm*/

	/*矩阵块必须定位对齐，先做预处理*/

	shuffle(A,buf_A,buf_A_size,B,buf_B,buf_B_size,root,myid);

	elapsed_time=MPI_Wtime();

	/*包含cannon全部内容*/

	cannon(A,buf_A,buf_A_size,B,buf_B,buf_B_size,

		C,buf_C_size,max_rows_a,max_cols_a,max_cols_b,root,myid);

	MPI_Barrier(MPI_COMM_WORLD);

	elapsed_time=MPI_Wtime()-elapsed_time;/*统计cannon算法实际耗时*/

	MPI_Barrier(MPI_COMM_WORLD);/*等待所有进程完成cannon算法，将结果发送给进程0*/

	int fsize_c=sizeof(int)*n1*n3;

	if(myid==0){

		/*进程0创建文件写句柄，准备将计算结果写入文件中*/

		if(!(fhc=fopen(argv[3],"w"))){

			printf("Cant't open file %s\n",argv[3]);

			MPI_Finalize();

		}

		fstream_c=(int*)malloc(fsize_c);

		/*进程0 接收来自各个进程的结果矩阵，拼接成一个完整的结果，写入文件，持久化数据结果*/

		gather_matrix(fstream_c,n1,n3,C,root,fhc);

	}

	MPI_Barrier(MPI_COMM_WORLD);    /*make sure proc 0 read all it needs*/

	if(myid==0){

		int i,j;

		printf("Cannon algorithm :multiply a %d* %d with a %d*%d, use %lf(s)\n",

			n1,n2,n2,n3,elapsed_time);

		/*printf("I have finished!\n");*/

		fclose(fhc);/*关闭文件读写句柄*/

		/*释放申请的内存空间*/

		free(fstream_a);

		free(fstream_b);

		free(fstream_c);

	}

	/*释放申请的内存空间*/

	free(A);free(buf_A);

	free(B);free(buf_B);

	free(C);

	MPI_Finalize();

	return 0;

}

Parallel Computing–Cannon算法 (MPI 实现)的更多相关文章

MPI中的cannon算法
Cannon算法算法过程假设矩阵\(A,B\)和\(C\)都可以分成\(m\times m\)块矩阵,即\(A = (A_{(ij)})_{m\times m},B = (B_{(ij)})_{m ...
Introduction to Parallel Computing
Copied From:https://computing.llnl.gov/tutorials/parallel_comp/ Author: Blaise Barney, Lawrence Live ...
Parallel.For 平行算法使用
之前看到Parallel的用法,觉得很高深,很腻害,今天专门抽空研究了一下,发现还是很easy的. .NET Framework 4.0 新加的功能,所以4.0之前的无法使用哦. 下面介绍一下,Par ...
General mistakes in parallel computing
这是2013年写的一篇旧文,放在gegahost.net上面 http://raison.gegahost.net/?p=97 March 11, 2013 General mistakes in ...
Method and apparatus for an atomic operation in a parallel computing environment
A method and apparatus for a atomic operation is described. A method comprises receiving a first pro ...
MPI编程——分块矩阵乘法(cannon算法)
https://blog.csdn.net/a429367172/article/details/88933877
STROME --realtime & online parallel computing
Data Collections ---> Stream to Channel (as source input) ----> Parallel Computing---> Resu ...
Distributed and Parallel Computing
Omega Network Model
[译]何时使用 Parallel.ForEach，何时使用 PLINQ
原作者: Pamela Vagata, Parallel Computing Platform Group, Microsoft Corporation 原文pdf:http://download.c ...

随机推荐

【干货分享】JPager.Net MVC超好用轻量级分页控件
JPager.Net MVC好用的轻量级分页控件,好用到你无法想象,轻量到你无法想象. JPager.Net MVC好用的轻量级分页控件,实现非常简单,使用也非常简单. JPager.Net M ...
sdk墙内更新方法
因为GFW有“保护”,我们能“安全”的遨游在中华互联局域网内.如何快速地更新sdk,一直是Android开发者的心病.网上流传着五花八门的方法,在这我记录一些我用过的切实可行的方法供给有需要的人.同时 ...
Node 进阶：express 默认日志组件 morgan 从入门使用到源码剖析
本文摘录自个人总结<Nodejs学习笔记>,更多章节及更新,请访问 github主页地址.欢迎加群交流,群号 197339705. 章节概览 morgan是express默认的日志中间件, ...
利用PhotoShop将Font-Awesome转为图片格式
介绍如何将Font-Awesome等字体图标转换为图片格式,使用PHOTPSHOP很简单. 网上找了很多,都比较麻烦.别问为什么要这么做,因为你还没遇到需要的时候. 下载Font-Awesome字体库 ...
移动端前端UI库—Frozen UI、WeUI、SUI Mobile
[MUI]http://www.dcloud.io/ [Clouda]http://clouda.baidu.com/blend2是百度历时两年共同研发的开源App技术框架,基于Node.js,简单易 ...
Congruence relation 同余关系
https://en.wikipedia.org/wiki/Congruence_relation https://zh.wikipedia.org/wiki/%E5%90%8C%E9%A4%98%E ...
jaccard similarity coefficient 相似度计算
Jaccard index From Wikipedia, the free encyclopedia The Jaccard index, also known as the Jaccard ...
Vim块注释
如何在VIM下快速注释块代码添加块注释 01.进入视图模式 v进入视图模式,控制方向键选中注释的代码 02.进入列模式并插入# ctrl+v进入列,I插入注释# 03.全部注释 esc两次自动全部注 ...
MVC认知路【点点滴滴支离破碎】【五】----form表单上传单个文件
//个人理解:前台一个form加input[type='file'],在加一个submit的按钮主要设置form的action,method,enctype='multipart/form-data ...
killall 根据名称终止进程
根据名称终止进程 killall [option] name-list killall 将信号发送到一个或多个进程用来终止它.除超级用户外,只有进程的所有者才可以对进程执行killall,超级用户可以 ...

Parallel Computing–Cannon算法 (MPI 实现)

Parallel Computing–Cannon算法 (MPI 实现)的更多相关文章

随机推荐

热门专题