PostgreSQL Streaming Replication的FATAL ERROR

磨砺技术珠玑，践行数据之道，追求卓越价值
回到上一级页面： PostgreSQL集群方案相关索引页回到顶级页面：PostgreSQL索引页[作者高健@博客园 luckyjackgao@gmail.com]

首先，通过代码，查看调用关系:

libpqwalreceiver.c            _PG_init 函数

/*
 * Module load callback.
 *
 * Publishes this module's libpq-based implementations through the four
 * walrcv_* hook pointers.  If any of the hooks is already set, the load
 * is reported via elog(ERROR) as "already loaded".
 */
void
_PG_init(void)
{
    /* All four hooks must still be unset before we claim them */
    if (!(walrcv_connect == NULL && walrcv_receive == NULL &&
          walrcv_send == NULL && walrcv_disconnect == NULL))
        elog(ERROR, "libpqwalreceiver already loaded");

    /* Tell walreceiver how to reach us */
    walrcv_disconnect = libpqrcv_disconnect;
    walrcv_send = libpqrcv_send;
    walrcv_receive = libpqrcv_receive;
    walrcv_connect = libpqrcv_connect;
}

再看

walreceiver.c            WalReceiverMain 函数

/*
 * Hook pointers for the connect / receive / send / disconnect operations.
 * All four start out as NULL; libpqreceiver hooks to these when loaded.
 */

walrcv_connect_type walrcv_connect = NULL;

walrcv_receive_type walrcv_receive = NULL;

walrcv_send_type walrcv_send = NULL;

walrcv_disconnect_type walrcv_disconnect = NULL;

…                                

/*
 * Main entry point for walreceiver process.
 *
 * This is an abridged excerpt: each "..." comment marks code that was
 * omitted from the quotation (including the declarations of conninfo,
 * startpoint, type, buf and len).  The visible part loads the
 * libpq-specific module, connects to the primary, and then loops
 * receiving and processing messages from the WAL stream.
 */
void
WalReceiverMain(void)
{
    /* ... */

    /* Load the libpq-specific functions */
    load_file("libpqwalreceiver", false);
    if (walrcv_connect == NULL || walrcv_receive == NULL ||
        walrcv_send == NULL || walrcv_disconnect == NULL)
        elog(ERROR, "libpqwalreceiver didn't initialize correctly");

    /* ... */

    /* Establish the connection to the primary for XLOG streaming */
    EnableWalRcvImmediateExit();
    walrcv_connect(conninfo, startpoint);
    DisableWalRcvImmediateExit();

    /* Initialize LogstreamResult, reply_message and feedback_message */
    LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL);
    MemSet(&reply_message, 0, sizeof(reply_message));
    MemSet(&feedback_message, 0, sizeof(feedback_message));

    /* Loop until end-of-streaming or error */
    for (;;)
    {
        /* ... */

        /* Wait a while for data to arrive */
        if (walrcv_receive(NAPTIME_PER_CYCLE, &type, &buf, &len))
        {
            /* Accept the received data, and process it */
            XLogWalRcvProcessMsg(type, buf, len);

            /*
             * Receive any more data we can without sleeping (a timeout of
             * 0 asks the receive hook not to wait).
             */
            while (walrcv_receive(0, &type, &buf, &len))
                XLogWalRcvProcessMsg(type, buf, len);

            /* Let the master know that we received some data. */
            XLogWalRcvSendReply();

            /*
             * If we've written some records, flush them to disk and let the
             * startup process and primary server know about them.
             */
            XLogWalRcvFlush(false);
        }
        else
        {
            /*
             * We didn't receive anything new, but send a status update to the
             * master anyway, to report any progress in applying WAL.
             */
            XLogWalRcvSendReply();
            XLogWalRcvSendHSFeedback();
        }
    }
}

再看

libpqwalreceiver.c            libpqrcv_receive 函数

/*
 * Receive a message available from XLOG stream, blocking for
 * maximum of 'timeout' ms.
 *
 * Returns:
 *
 *     True if data was received. *type, *buffer and *len are set to
 *     the type of the received data, buffer holding it, and length,
 *     respectively.
 *
 *     False if no data was available within timeout, or wait was interrupted
 *     by signal.
 *
 * The buffer returned is only valid until the next call of this function or
 * libpq_connect/disconnect.
 *
 * ereports on error.
 */
static bool
libpqrcv_receive(int timeout, unsigned char *type, char **buffer, int *len)
{
    int    rawlen;

    /* Release the buffer handed out by the previous call, if any */
    if (recvBuf != NULL)
        PQfreemem(recvBuf);
    recvBuf = NULL;

    /* Try to receive a CopyData message (async = 1: may return 0) */
    rawlen = PQgetCopyData(streamConn, &recvBuf, 1);
    if (rawlen == 0)
    {
        /*
         * No data available yet. If the caller requested to block, wait for
         * more data to arrive.
         */
        if (timeout > 0)
        {
            if (!libpq_select(timeout))
                return false;
        }

        if (PQconsumeInput(streamConn) == 0)
            ereport(ERROR,
                    (errmsg("could not receive data from WAL stream: %s",
                            PQerrorMessage(streamConn))));

        /* Now that we've consumed some input, try again */
        rawlen = PQgetCopyData(streamConn, &recvBuf, 1);
        if (rawlen == 0)
            return false;
    }

    if (rawlen == -1)            /* end-of-streaming or error */
    {
        PGresult   *res;

        res = PQgetResult(streamConn);
        if (PQresultStatus(res) == PGRES_COMMAND_OK)
        {
            PQclear(res);
            ereport(ERROR,
                    (errmsg("replication terminated by primary server")));
        }
        PQclear(res);

        ereport(ERROR,
                (errmsg("could not receive data from WAL stream: %s",
                        PQerrorMessage(streamConn))));
    }

    if (rawlen < -1)            /* PQgetCopyData reported an error */
        ereport(ERROR,
                (errmsg("could not receive data from WAL stream: %s",
                        PQerrorMessage(streamConn))));

    /*
     * Return received messages to caller: the first byte of the buffer is
     * the message type, the rest is the payload.
     */
    *type = *((unsigned char *) recvBuf);
    *buffer = recvBuf + sizeof(*type);
    *len = rawlen - sizeof(*type);

    return true;
}

再看：

    fe-exec.c        PQgetCopyData 函数

    /*
     * PQgetCopyData - read a row of data from the backend during COPY OUT
     * or COPY BOTH
     *
     * If successful, sets *buffer to point to a malloc'd row of data, and
     * returns row length (always > 0) as result.
     * Returns 0 if no row available yet (only possible if async is true),
     * -1 if end of copy (consult PQgetResult), or -2 if error (consult
     * PQerrorMessage).
     */
    int
    PQgetCopyData(PGconn *conn, char **buffer, int async)
    {
        *buffer = NULL;                /* for all failure cases */

        /* No connection object at all: report as an error (-2) */
        if (!conn)
            return -2;

        /* Only valid while a COPY OUT or COPY BOTH is in progress */
        if (conn->asyncStatus != PGASYNC_COPY_OUT &&
            conn->asyncStatus != PGASYNC_COPY_BOTH)
        {
            printfPQExpBuffer(&conn->errorMessage,
                              libpq_gettext("no COPY in progress\n"));
            return -2;
        }

        /* Dispatch on the protocol major version of the connection */
        if (PG_PROTOCOL_MAJOR(conn->pversion) >= 3)
            return pqGetCopyData3(conn, buffer, async);
        else
            return pqGetCopyData2(conn, buffer, async);
    }

还有这个：

    fe-exec.c        PQgetCopyData 函数

    /*
     * PQgetCopyData - read a row of data from the backend during COPY OUT
     * or COPY BOTH
     *
     * If successful, sets *buffer to point to a malloc'd row of data, and
     * returns row length (always > 0) as result.
     * Returns 0 if no row available yet (only possible if async is true),
     * -1 if end of copy (consult PQgetResult), or -2 if error (consult
     * PQerrorMessage).
     */
    int
    PQgetCopyData(PGconn *conn, char **buffer, int async)
    {
        *buffer = NULL;                /* for all failure cases */

        /* No connection object at all: report as an error (-2) */
        if (!conn)
            return -2;

        /* Only valid while a COPY OUT or COPY BOTH is in progress */
        if (conn->asyncStatus != PGASYNC_COPY_OUT &&
            conn->asyncStatus != PGASYNC_COPY_BOTH)
        {
            printfPQExpBuffer(&conn->errorMessage,
                              libpq_gettext("no COPY in progress\n"));
            return -2;
        }

        /* Dispatch on the protocol major version of the connection */
        if (PG_PROTOCOL_MAJOR(conn->pversion) >= 3)
            return pqGetCopyData3(conn, buffer, async);
        else
            return pqGetCopyData2(conn, buffer, async);
    }

事实上，从上面的逻辑，可以看到：

如果slave端读取wal，发生了错误，那么它会在循环中再次试图读取，直到成功为止。

所以，出错了不要紧。当然这种 FATAL ERROR出现，肯定是要引起重视的。

经过试验，发现有几种可能会出现错误：

错误发生原因1：如果没有采用 archive log模式，那么当master端事务任务过重，导致在线wal log很快被删除覆盖，那么slave端就会找不到相应的wal log，于是在master端和客户端都出现：

FATAL: could not receive data from WAL stream: FATAL: requested WAL segment 0000000100000000000000XX has already been removed

错误发生原因2：

如果在master端的postgresql.conf文件中，设置了 replication_timeout，但wal_receiver_status_interval 却等于零，

则经过 replication_timeout秒后，如果master和slave之间因为master不忙等原因没有通信，则master会主动把这个连接干掉。

所以此时

master端出现：LOG：terminating walsender process due to replication timeout

slave端出现： FATAL: could not receive data from WAL stream:

可能的错误发生原因3：

这可能是和 recovery.conf 中的primary_conninfo有关：

例如：

primary_conninfo = 'host=master port=5432 application_name=mypg user=postgres connect_timeout=10 keepalives_idle=10 keepalives_interval=1 keepalives_count=3'

这样，连接空闲10秒之后，为了看看当前连接是否已经失效，就开始发送keepalive数据包；如果没有得到对方响应，则每隔1秒重发一次，连续3个keepalive数据包都没有得到响应，那么就认为连接已经死掉。

这样，如果master端的通讯比较繁忙，可能来不及应答，这样就可能发生 FATAL: could not receive data from WAL stream: could not receive data from server: connection timeout error，目前此种情况尚未再现出来，尚需验证。

下面这段话，说明了hot-standby 的中间过程：

http://www.postgresql.org/docs/9.2/static/warm-standby.html

In standby mode, the server continuously applies WAL received from the master server. The standby server can read WAL from a WAL archive (see restore_command) or directly from the master over a TCP connection (streaming replication). The standby server will also attempt to restore any WAL found in the standby cluster's pg_xlog directory. That typically happens after a server restart, when the standby replays again WAL that was streamed from the master before the restart, but you can also manually copy files to pg_xlog at any time to have them replayed.

At startup, the standby begins by restoring all WAL available in the archive location, calling restore_command. Once it reaches the end of WAL available there and restore_command fails, it tries to restore any WAL available in the pg_xlog directory. If that fails, and streaming replication has been configured, the standby tries to connect to the primary server and start streaming WAL from the last valid record found in archive or pg_xlog. If that fails or streaming replication is not configured, or if the connection is later disconnected, the standby goes back to step 1 and tries to restore the file from the archive again. This loop of retries from the archive, pg_xlog, and via streaming replication goes on until the server is stopped or failover is triggered by a trigger file.

就是说: standby server一旦启动，就会按照 archive directory --> pg_xlog directory ---> streaming replication 的顺序来应用 wal log。

所以，单纯由于网络环境造成出错的可能性比较大。

[作者高健@博客园 luckyjackgao@gmail.com]
回到上一级页面： PostgreSQL集群方案相关索引页 回到顶级页面：PostgreSQL索引页 磨砺技术珠玑，践行数据之道，追求卓越价值

PostgreSQL Streaming Replication的FATAL ERROR的更多相关文章

配置PostgreSQL Streaming Replication集群
运行环境: Primary: 192.168.0.11 Standby: 192.168.0.21, 192.168.0.22 OS: CentOS 6.2 PostgreSQL: 9.1.2 版本以 ...
postgresql Streaming Replication监控与注意事项
一监控Streaming Replication集群 1 pg_stat_replication视图(主库端执行) pid Wal sender process的进程ID usesysid 执行流复制 ...
MySQL Got fatal error 1236原因和解决方法【转】
本文来自:http://blog.itpub.net/22664653/viewspace-1714269/ 一前言 MySQL 的主从复制作为一项高可用特性,用于将主库的数据同步到从库,在维护主 ...
Streaming replication slots in PostgreSQL 9.4
Streaming replication slots are a pending feature in PostgreSQL 9.4, as part of the logical changese ...
PostgreSQL的streaming replication
磨砺技术珠矶,践行数据之道,追求卓越价值回到上一级页面: PostgreSQL集群方案相关索引页回到顶级页面:PostgreSQL索引页[作者高健@博客园 luckyjackgao@gm ...
PostgreSQL 9.3 Streaming Replication 状态监控
postgresql是使用Streaming Replication来实现热备份的,热备份的作用如下: 灾难恢复高可用性负载均衡,当你使用Streaming Replication来实现热备份(h ...
MySql配置主从模式 Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave have equal MySQL server UUIDs; these UUIDs must be different for replication to work.
今天在学习MyCat环境搭建的时候,在配置MySql的主从模式,发现slave在配置完毕后,配置的内容全部正确的情况下,报错了? Last_IO_Error: Fatal error: The sla ...
配置MySQL主从复制报错Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave have equal MySQL server ids; these ids must be different for replication to work
配置MySQL主从复制报错 ``` Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave ha ...
PostgreSQL Cascade Replication
PostgreSQL Cascade Replication node1:master:10.2.208.10:repclia(user) node2:upstreamnode:10.2.208.11 ...

随机推荐

Python ，pickle
@Python pickle模块学习 pickle提供了一个简单的持久化功能.可以将对象以文件的形式存放在磁盘上. ---------------------------------------- ...
python with原型
@Python 的 with 语句详解这篇文章主要介绍了Python 的 with 语句,本文详细讲解了with语句.with语句的历史.with语句的使用例子等,需要的朋友可以参考下一. ...
坑之OJ-玄学、不可抗力
自家学校OJ网站上的题目,很玄学,不知道哪里出的问题. 这个OJ链接的题目没有问题的. https://www.luogu.org/problemnew/show/P1981 #include < ...
3、Dubbo-环境搭建
官方推荐使用 Zookeeper 注册中心 3.1).[windows]-安装zookeeper 开发中均在Linux中安装!!! 1.下载zookeeper 网址 https://archive.a ...
改变random.seed()种子值，获取不同的随机值
random.seed() random.seed()是随机数种子,也就是为随机数提供算法,完全相同的种子产生的随机数列是相同的, 所以如果想产生不同的随机数就需要用当前时间作为种子一般情况下see ...
Elasticsearch + Elasticsearch-head搭建
Elasticsearch搭建: [root@hdoop3 elk]# tar -xvf elasticsearch-6.2.4.tar [root@hdoop3 elk]# cd elasticse ...
Tomcat处理请求流程
Connector组件的Acceptor监听客户端套接字连接并接收Socket. 将连接交给线程池Executor处理,开始执行请求响应任务. Processor组件读取消息报文,解析请求行.请求体. ...
PAT——1056. 组合数的和
给定N个非0的个位数字,用其中任意2个数字都可以组合成1个2位的数字.要求所有可能组合出来的2位数字的和.例如给定2.5.8,则可以组合出:25.28.52.58.82.85,它们的和为330. 输入 ...
PAT——1027. 打印沙漏
本题要求你写个程序把给定的符号打印成沙漏的形状.例如给定17个“*”,要求按下列格式打印 ***** *** * *** ***** 所谓“沙漏形状”,是指每行输出奇数个符号:各行符号中心对齐:相邻两 ...
Xcode 5.1安装插件：规范凝视生成器VVDocumenter
类似java的多行凝视! 安装过程: 1.前往GitHub下载project文件:VVDocumenter-Xcode 2.用Xcode打开project,Command + B Build成功后,能 ...

PostgreSQL Streaming Replication的FATAL ERROR

PostgreSQL Streaming Replication的FATAL ERROR的更多相关文章

随机推荐

热门专题