OpenMPI源码剖析2：ompi_mpi_errors_are_fatal_comm_handler函数

上一篇文章说到，初始化失败时会有一个函数调用:

/* Call made when initialization fails: no communicator and no error code are
   supplied; `message` is passed as the variadic argument. */
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, message);

所以这里简单地进入了 ompi_mpi_errors_are_fatal_comm_handler 函数:
看到其头文件 errhandler_predefined.h :

/*
 * errhandler_predefined.h: prototypes of the predefined error handler
 * functions, one per object kind (communicator, file, window).
 */
#ifndef OMPI_ERRHANDLER_PREDEFINED_H

#define OMPI_ERRHANDLER_PREDEFINED_H

#include "ompi_config.h"

/* Forward declarations: the prototypes below only take pointers to these types. */
struct ompi_communicator_t;

struct ompi_file_t;

struct ompi_win_t;

/**

 * Handler functions for MPI_ERRORS_ARE_FATAL (note the "fatal" in the name).
 * Each takes a pointer to the object handle, a pointer to the error code,
 * and a variable argument list.

 */

OMPI_DECLSPEC void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,

					    int *error_code, ...);

OMPI_DECLSPEC void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,

					    int *error_code, ...);

OMPI_DECLSPEC void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,

					    int *error_code, ...);

/**

 * Handler functions for MPI_ERRORS_RETURN (the "errors return" counterpart).
 * Same parameter layout as the fatal handlers above.

 */

OMPI_DECLSPEC void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,

                                   int *error_code, ...);

OMPI_DECLSPEC void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,

                                   int *error_code, ...);

OMPI_DECLSPEC void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,

                                   int *error_code, ...);

#endif /* OMPI_ERRHANDLER_PREDEFINED_H */

　跳去它的实现文件 errhandler_predefined.c 中看对应函数:　

/**
 * MPI_ERRORS_ARE_FATAL handler for communicators.
 *
 * Picks the communicator and its name out of the handle (when one was
 * supplied) and forwards them, together with the error code and the
 * variadic arguments, to backend_fatal().
 *
 * @param comm        Pointer to the communicator handle, or NULL.
 * @param error_code  Pointer to the error code, or NULL; forwarded unchanged.
 * @param ...         Extra arguments, forwarded to backend_fatal() as a va_list.
 */
void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
                                            int *error_code, ...)
{
    /* Without a handle there is neither a communicator nor a name to report. */
    struct ompi_communicator_t *target = (NULL != comm) ? *comm : NULL;
    char *label = (NULL != comm) ? (*comm)->c_name : NULL;
    va_list extra;

    va_start(extra, error_code);
    backend_fatal("communicator", target, label, error_code, extra);
    va_end(extra);
}

　映入眼帘的是 ompi_communicator_t 结构体，MPI中的通信子应该就是通过这个结构体来实现的，后期要重点学习

这里涉及到可变参数列表（va_list）的知识，参考此处: http://www.cnblogs.com/hanyonglu/archive/2011/05/07/2039916.html

放到此处的具体场景，也就是arglist只有message一个变量，其实也就是:

/* Name of the MPI function in progress; per the surrounding text this is the
   string that reaches the error handler as `message`. */
static const char FUNC_NAME[] = "MPI_Init";

这里做了一些处理之后，调用 backend_fatal 函数，这是本文件中的一个静态（static）函数，它做了什么呢？ :

/**
 * Shared back end of the MPI_ERRORS_ARE_FATAL handlers: report the error,
 * then abort.
 *
 * @param type        Kind of object the error was raised on (e.g. "communicator").
 * @param comm        Communicator to abort on, or NULL.
 * @param name        Name of the object, or NULL.
 * @param error_code  Pointer to the error code, or NULL (abort code 1 is used).
 * @param arglist     Variadic arguments of the original handler call.
 */
static void backend_fatal(char *type, struct ompi_communicator_t *comm,
                          char *name, int *error_code,
                          va_list arglist)
{
    int abort_code = 1;   /* used when the caller supplied no error code */

    /* Aggregated reporting is only attempted while the RTE is initialized. */
    if (!ompi_rte_initialized) {
        backend_fatal_no_aggregate(type, comm, name, error_code, arglist);
    } else {
        backend_fatal_aggregate(type, comm, name, error_code, arglist);
    }

    /* The communicator is valid in most cases. When it is not, we are either
     * early in initialization or dealing with a window; aborting on
     * MPI_COMM_SELF is good enough because the error will propagate. */
    if (NULL == comm) {
        comm = &ompi_mpi_comm_self.comm;
    }

    if (NULL != error_code) {
        abort_code = *error_code;
    }
    ompi_mpi_abort(comm, abort_code);
}

　　backend_fatal_aggregate 函数是和聚合有关的，我们先跳过。————留下疑点1

因为我们传入的 error_code 参数为NULL，最后程序会进入 ompi_mpi_abort(comm, 1);

它声明在 mpiruntime.h 头文件中，实现在 ompi_mpi_abort.c 文件中，贴上它的函数定义代码:

/*
 * Abort the MPI job.
 *
 * In the initialization-failure path discussed here the call is
 * ompi_mpi_abort(&ompi_mpi_comm_self.comm, 1).
 *
 * Steps:
 *   1. guard against recursive invocation and work out the host name;
 *   2. optionally print a stack trace;
 *   3. optionally wait before aborting;
 *   4. if the RTE is not initialized, report and _exit() locally;
 *   5. otherwise try to kill only the peers in `comm`;
 *   6. as a last resort abort the whole job through the RTE.
 *
 * comm:    communicator whose processes should be aborted (may be NULL).
 * errcode: exit/abort code; 0 is turned into 1 on the local _exit() path.
 *
 * Returns OMPI_SUCCESS only when called recursively; every other path is
 * expected not to return.
 */
int
ompi_mpi_abort(struct ompi_communicator_t* comm,
               int errcode)
{
    char *host, hostname[OPAL_MAXHOSTNAMELEN];
    pid_t pid = 0;

    /* ---- Part 1: recursion guard and host name ---- */

    /* Protection for recursive invocation */
    if (have_been_invoked) {
        return OMPI_SUCCESS;
    }
    have_been_invoked = true;

    /* If MPI is initialized, we know we have a runtime nodename, so
       use that.  Otherwise, call gethostname.
       (Open question 2: what exactly is the "RTE"?  Presumably the
       run-time environment.) */
    if (ompi_rte_initialized) {
        /* host is the node name kept in ompi_process_info
           (open question 3: what else lives in that structure?) */
        host = ompi_process_info.nodename;
    } else if (0 == gethostname(hostname, sizeof(hostname))) {
        /* POSIX leaves termination unspecified when the name is truncated,
           so terminate explicitly before the buffer is printed with %s. */
        hostname[sizeof(hostname) - 1] = '\0';
        host = hostname;
    } else {
        /* gethostname() failed: the buffer content is indeterminate. */
        host = "unknown";
    }
    pid = getpid();

    /* ---- Part 2: print the call stack ---- */

    /* Should we print a stack trace?  Not aggregated because they
       might be different on all processes. */
    if (opal_abort_print_stack) {
        char **messages;
        int len, i;

        if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
            for (i = 0; i < len; ++i) {
                fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
                        i, messages[i]);
                fflush(stderr);
            }
            free(messages);
        } else {
            /* This will print an message if it's unable to print the
               backtrace, so we don't need an additional "else" clause
               if opal_backtrace_print() is not supported. */
            opal_backtrace_print(stderr, NULL, 1);
        }
    }

    /* ---- Part 3: wait before aborting ---- */

    /* Should we wait for a while before aborting? */
    if (0 != opal_abort_delay) {
        if (opal_abort_delay < 0) {
            fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
                    host, (int) pid);
            fflush(stderr);
            while (1) {
                sleep(5);
            }
        } else {
            fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
                    host, (int) pid, opal_abort_delay);
            do {
                sleep(1);
            } while (--opal_abort_delay > 0);
        }
    }

    /* ---- Part 4: RTE not initialized -> local exit only; the message
       depends on the value of ompi_mpi_finalized ---- */

    /* If the RTE isn't setup yet/any more, then don't even try
       killing everyone.  Sorry, Charlie... */
    if (!ompi_rte_initialized) {
        fprintf(stderr, "[%s:%d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
                host, (int) pid, ompi_mpi_finalized ?
                "after MPI_FINALIZE started" : "before MPI_INIT completed");
        _exit(errcode == 0 ? 1 : errcode);
    }

    /* ---- Part 5: with a communicator, kill that set of processes.
       (Open question 4: does this mean that one process failing to
       initialize takes the whole set of processes down?) ---- */

    /* If OMPI is initialized and we have a non-NULL communicator,
       then try to kill just that set of processes */
    if (ompi_mpi_initialized && !ompi_mpi_finalized && NULL != comm) {
        try_kill_peers(comm, errcode);   /* open question 5 */
    }

    /* ---- Part 6: rarely reached; abort the run-time environment ---- */

    /* We can fall through to here in a few cases:

       1. The attempt to kill just a subset of peers via
          try_kill_peers() failed (e.g., as of July 2014, ORTE
          returns NOT_IMPLEMENTED from orte_rte_abort_peers()).
       2. MPI wasn't initialized, was already finalized, or we got a
          NULL communicator.

       In all of these cases, the only sensible thing left to do is to
       kill the entire job.  Wah wah. */
    ompi_rte_abort(errcode, NULL);   /* open question 6 */

    /* Does not return */
}

为了章节结构好看，本篇就到此，留下6个疑问，在此重申一下:

1. backend_fatal_aggregate 函数

2. rte到底是什么？这个估计后续的深入了解会接触的更多,可能是runtime environment

3. host 保存的是节点名（nodename，而不是 rank），它来自 ompi_process_info 这个结构体

4.那是不是说，如果有一个进程初始化失败，整个进程集都会挂掉呢？

5. try_kill_peers 函数

6. ompi_rte_abort 函数

OpenMPI源码剖析2：ompi_mpi_errors_are_fatal_comm_handler函数的更多相关文章

OpenMPI源码剖析1：MPI_Init初探
OpenMPI的底层实现: 我们知道,OpenMPI应用起来还是比较简单的,但是如果让我自己来实现一个MPI的并行计算,你会怎么设计呢?————这就涉及到比较底层的东西了. 回想起我们最简单的代码,通 ...
5.2【Linux 内核网络协议栈源码剖析】socket 函数剖析 ☆☆☆
深度剖析网络协议栈中的 socket 函数,可以说是把前面介绍的串联起来,将网络协议栈各层关联起来. 应用层 FTP SMTP HTTP ... 传输层 TCP UDP 网络层 IP ICMP ARP ...
OpenMPI源码剖析：网络通信原理(二) 如何选择网络协议?
因为比较常用的是 TCP 协议,所以在 opal/mca/btl/tcp/btl_tcp.h 头文件中找到对应的 struct mca_btl_tcp_component_t { mca_btl_ba ...
OpenMPI源码剖析3：try_kill_peers 和 ompi_rte_abort 函数
接着上一篇的疑问,我们说道,会执行 try_kill_peers 函数,它的函数定义在 ompi_mpi_abort.c 下: // 这里注释也说到了,主要是杀死在同一个communicator的进程 ...
OpenMPI源码剖析：网络通信原理（一）
MPI中的网络通信的原理,需要解决以下几个问题: 1. MPI使用什么网络协议进行通信? 2.中央数据库是存储在哪一台机器上? 3.集群中如果有一台机器挂掉了是否会影响其他机器? 参考: https: ...
OpenMPI源码剖析4：rte.h 头文件的说明信息
上一篇文章中说道,我们在 rte.h 中发现了有价值的说明: 我们一块一块来分析,首先看到第一块,关于 Process name Object: * (a) Process name objects ...
STL源码剖析之_allocate函数
SGI STL提供的标准std::allocator中的_allocate函数代码如下: template<class T> inline T* _allocate(ptrdiff_t s ...
菜鸟nginx源码剖析框架篇（一）从main函数看nginx启动流程（转）
俗话说的好,牵牛要牵牛鼻子驾车顶牛,处理复杂的东西,只要抓住重点,才能理清脉络,不至于深陷其中,不能自拔.对复杂的nginx而言,main函数就是“牛之鼻”,只要能理清main函数,就一定能理解其中 ...
c++ stl源码剖析学习笔记(一)uninitialized_copy()函数
template <class InputIterator, class ForwardIterator>inline ForwardIterator uninitialized_copy ...

随机推荐

Spring - 父容器与子容器
一.Spring容器(父容器) 1.Mapper代理对象 2.Service对象二.Springmvc(前端控制器)(子容器) Controller对象 1.标准的配置是这样的:Con ...
unittest单元测试框架之unittest案例（二）
1.待测方法: # 加法,返回 a+b 的值 def add(a,b): return a+b # 减法,返回 a-b 的值 def minus(a,b): return a-b # 乘法,返回 a* ...
Docker 三种UI管理平台
docker集中化web管理平台一.shipyard 1.启动docker,下载镜像 # systemctl restart docker # docker pull alpine # docker ...
DML-删除
方式一:使用delete一.删除单表的记录★语法:delete from 表名 [where 筛选条件][limit 条目数]二.级联删除[补充]语法:delete 别名1,别名2 from 表1 别 ...
Oracle 体系结构三后台进程
实例后台进程在启动实例时启动,在终止实例时终止运行. SMON SMON(system monitor)起初的任务是安装和打开数据.SMON通过查找和验证数据库控制文件来安装数据库.此后,它通过查找和 ...
Shader Optimization Tips
Author : http://www.cnblogs.com/open-coder/p/3982999.html During the last few months, I have been wo ...
模板——最小生成树kruskal算法+并查集数据结构
并查集:找祖先并更新,注意路径压缩,不然会时间复杂度巨大导致出错/超时合并:(我的祖先是的你的祖先的父亲) 找父亲:(初始化祖先是自己的,自己就是祖先) 查询:(我们是不是同一祖先) 路径压缩:(每 ...
laravel form 表单提交
form表单需要加token,不然会出现419错误,csrf_token不用自己生成,放进去就行,laravel自己会生成路由: 控制器生成一个:
DOM中的事件傳播機制
要講到事件傳播機制之前,首先要瞭解的是什麼是事件? 事件,發生在靜態頁面與動態行為之間的交互行為.是JavaScript 和 HTML的交互是通过事件实现的.比如,按鈕的點擊,鼠標的滑過,鍵盤的輸入 ...
matlab读取txt文本
textread函数原文:http://blog.sina.com.cn/s/blog_618af1950100k926.html 文档:https://ww2.mathworks.cn/help/ ...

OpenMPI源码剖析2：ompi_mpi_errors_are_fatal_comm_handler函数

OpenMPI源码剖析2：ompi_mpi_errors_are_fatal_comm_handler函数的更多相关文章

随机推荐

热门专题