PostgreSQL Streaming Replication的FATAL ERROR

磨砺技术珠玑，践行数据之道，追求卓越价值
回到上一级页面： PostgreSQL集群方案相关索引页 | 回到顶级页面：PostgreSQL索引页 [作者高健@博客园 luckyjackgao@gmail.com]

首先，通过代码，查看调用关系:

libpqwalreceiver.c            _PG_init 関数                

/*
 * Module load callback.
 *
 * Installs the libpq-based implementations into walreceiver's four hook
 * pointers.  Raises an ERROR if any hook is already set, which means this
 * module has been loaded before.
 */
void
_PG_init(void)
{
    /* All four hooks must still be unset; otherwise we were already loaded */
    if (!(walrcv_connect == NULL && walrcv_receive == NULL &&
          walrcv_send == NULL && walrcv_disconnect == NULL))
    {
        elog(ERROR, "libpqwalreceiver already loaded");
    }

    /* Tell walreceiver how to reach us */
    walrcv_disconnect = libpqrcv_disconnect;
    walrcv_send = libpqrcv_send;
    walrcv_receive = libpqrcv_receive;
    walrcv_connect = libpqrcv_connect;
}

再看

walreceiver.c            WalReceiverMain 関数　                    

/*
 * Hook pointers that libpqwalreceiver fills in when it is loaded.
 *
 * They all start out NULL.  WalReceiverMain checks that none of them is
 * still NULL after loading the module, and reports an error otherwise.
 */

walrcv_connect_type walrcv_connect = NULL;

walrcv_receive_type walrcv_receive = NULL;

walrcv_send_type walrcv_send = NULL;

walrcv_disconnect_type walrcv_disconnect = NULL;                                

…                                

/*
 * Main entry point for walreceiver process.
 *
 * (Excerpt: lines marked with "…" are parts of the original function that
 * the article leaves out.)
 *
 * Loads the libpq-specific module, connects to the primary, and then loops
 * forever receiving and processing WAL messages.  Any error raised by the
 * hooks (e.g. by walrcv_receive) is reported through ereport/elog rather
 * than returned to this function.
 */
void
WalReceiverMain(void)
{
    …

    /* Load the libpq-specific functions */
    load_file("libpqwalreceiver", false);
    if (walrcv_connect == NULL || walrcv_receive == NULL ||
        walrcv_send == NULL || walrcv_disconnect == NULL)
        elog(ERROR, "libpqwalreceiver didn't initialize correctly");

    …

    /* Establish the connection to the primary for XLOG streaming */
    EnableWalRcvImmediateExit();
    walrcv_connect(conninfo, startpoint);
    DisableWalRcvImmediateExit();

    /* Initialize LogstreamResult, reply_message and feedback_message */
    LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL);
    MemSet(&reply_message, 0, sizeof(reply_message));
    MemSet(&feedback_message, 0, sizeof(feedback_message));

    /* Loop until end-of-streaming or error */
    for (;;)
    {
        …

        /* Wait a while for data to arrive */
        if (walrcv_receive(NAPTIME_PER_CYCLE, &type, &buf, &len))
        {
            /* Accept the received data, and process it */
            XLogWalRcvProcessMsg(type, buf, len);

            /*
             * Receive any more data we can without sleeping.  A timeout of
             * 0 makes walrcv_receive return immediately when nothing is
             * available.
             */
            while (walrcv_receive(0, &type, &buf, &len))
                XLogWalRcvProcessMsg(type, buf, len);

            /* Let the master know that we received some data. */
            XLogWalRcvSendReply();

            /*
             * If we've written some records, flush them to disk and let the
             * startup process and primary server know about them.
             */
            XLogWalRcvFlush(false);
        }
        else
        {
            /*
             * We didn't receive anything new, but send a status update to the
             * master anyway, to report any progress in applying WAL.
             */
            XLogWalRcvSendReply();
            XLogWalRcvSendHSFeedback();
        }
    }
}

再看

 libpqwalreceiver.c            libpqrcv_receive 関数                        

/*
 * Receive a message available from XLOG stream, blocking for
 * maximum of 'timeout' ms.
 *
 * Returns:
 *
 *     True if data was received. *type, *buffer and *len are set to
 *     the type of the received data, buffer holding it, and length,
 *     respectively.
 *
 *     False if no data was available within timeout, or wait was interrupted
 *     by signal.
 *
 * The buffer returned is only valid until the next call of this function or
 * libpq_connect/disconnect.
 *
 * ereports on error.
 */
static bool
libpqrcv_receive(int timeout, unsigned char *type, char **buffer, int *len)
{
    int    rawlen;

    /* Release the buffer handed out by the previous call, if any */
    if (recvBuf != NULL)
        PQfreemem(recvBuf);
    recvBuf = NULL;

    /*
     * Try to receive a CopyData message.  The last argument (async = 1)
     * asks PQgetCopyData not to block; it returns 0 when no complete row
     * is available yet.
     */
    rawlen = PQgetCopyData(streamConn, &recvBuf, 1);
    if (rawlen == 0)
    {
        /*
         * No data available yet. If the caller requested to block, wait for
         * more data to arrive.
         */
        if (timeout > 0)
        {
            if (!libpq_select(timeout))
                return false;
        }

        if (PQconsumeInput(streamConn) == 0)
            ereport(ERROR,
                    (errmsg("could not receive data from WAL stream: %s",
                            PQerrorMessage(streamConn))));

        /* Now that we've consumed some input, try again */
        rawlen = PQgetCopyData(streamConn, &recvBuf, 1);
        if (rawlen == 0)
            return false;
    }

    if (rawlen == -1)            /* end-of-streaming or error */
    {
        PGresult   *res;

        /* -1 means end of copy: the final result tells us why it ended */
        res = PQgetResult(streamConn);
        if (PQresultStatus(res) == PGRES_COMMAND_OK)
        {
            PQclear(res);
            ereport(ERROR,
                    (errmsg("replication terminated by primary server")));
        }
        PQclear(res);

        ereport(ERROR,
                (errmsg("could not receive data from WAL stream: %s",
                        PQerrorMessage(streamConn))));
    }

    /* -2 (anything below -1) is PQgetCopyData's error return */
    if (rawlen < -1)
        ereport(ERROR,
                (errmsg("could not receive data from WAL stream: %s",
                        PQerrorMessage(streamConn))));

    /* Return received messages to caller: first byte is the message type */
    *type = *((unsigned char *) recvBuf);
    *buffer = recvBuf + sizeof(*type);
    *len = rawlen - sizeof(*type);

    return true;
}

再看：

    fe-exec.c        PQgetCopyData 関数                        

    /*
     * PQgetCopyData - read a row of data from the backend during COPY OUT
     * or COPY BOTH
     *
     * If successful, sets *buffer to point to a malloc'd row of data, and
     * returns row length (always > 0) as result.
     * Returns 0 if no row available yet (only possible if async is true),
     * -1 if end of copy (consult PQgetResult), or -2 if error (consult
     * PQerrorMessage).
     */
    int
    PQgetCopyData(PGconn *conn, char **buffer, int async)
    {
        *buffer = NULL;                /* for all failure cases */

        /* No connection at all: report as error (-2) */
        if (!conn)
            return -2;

        /* Only valid while a COPY OUT / COPY BOTH is in progress */
        if (conn->asyncStatus != PGASYNC_COPY_OUT &&
            conn->asyncStatus != PGASYNC_COPY_BOTH)
        {
            printfPQExpBuffer(&conn->errorMessage,
                              libpq_gettext("no COPY in progress\n"));
            return -2;
        }

        /* Dispatch on the protocol major version of the connection */
        if (PG_PROTOCOL_MAJOR(conn->pversion) >= 3)
            return pqGetCopyData3(conn, buffer, async);
        else
            return pqGetCopyData2(conn, buffer, async);
    }

还有这个：

    fe-exec.c        PQgetCopyData 関数                        

    /*
     * PQgetCopyData - read a row of data from the backend during COPY OUT
     * or COPY BOTH
     *
     * If successful, sets *buffer to point to a malloc'd row of data, and
     * returns row length (always > 0) as result.
     * Returns 0 if no row available yet (only possible if async is true),
     * -1 if end of copy (consult PQgetResult), or -2 if error (consult
     * PQerrorMessage).
     */
    int
    PQgetCopyData(PGconn *conn, char **buffer, int async)
    {
        *buffer = NULL;                /* for all failure cases */

        /* No connection at all: report as error (-2) */
        if (!conn)
            return -2;

        /* Only valid while a COPY OUT / COPY BOTH is in progress */
        if (conn->asyncStatus != PGASYNC_COPY_OUT &&
            conn->asyncStatus != PGASYNC_COPY_BOTH)
        {
            printfPQExpBuffer(&conn->errorMessage,
                              libpq_gettext("no COPY in progress\n"));
            return -2;
        }

        /* Dispatch on the protocol major version of the connection */
        if (PG_PROTOCOL_MAJOR(conn->pversion) >= 3)
            return pqGetCopyData3(conn, buffer, async);
        else
            return pqGetCopyData2(conn, buffer, async);
    }

事实上，从上面的逻辑，可以看到：

如果slave端读取wal，发生了错误，那么它会在循环中再次试图读取，直到成功为止。

所以，出错了不要紧。当然这种 FATAL ERROR出现，肯定是要引起重视的。

经过试验，发现有几种可能会出现错误：

错误发生原因1：如果没有采用 archive log模式，那么当master端事务任务过重，导致在线wal log很快被删除覆盖，那么slave端就会找不到相应的wal log，于是在master端和客户端都出现：

FATAL: could not receive data from WAL stream: FATAL: requested WAL segment 0000000100000000000000XX has already been removed

错误发生原因2：

如果在master端的postgresql.conf文件中，设置了 replication_timeout，但wal_receiver_status_interval 却等于零，

则经过 replication_timeout秒后，如果master和slave之间因为master不忙等原因没有通信，则master会主动把这个连接干掉。

所以此时

master端出现：LOG：terminating walsender process due to replication timeout

slave端出现： FATAL: could not receive data from WAL stream:

可能的错误发生原因3：

这可能是和 recovery.conf 中的primary_conninfo有关：

例如：

primary_conninfo = 'host=master port=5432 application_name=mypg user=postgres connect_timeout=10 keepalives_idle=10 keepalives_interval=1 keepalives_count=3'

这样，连接空闲10秒之后，为了看看当前连接是否已经失效，就会发送keepalive数据包；如果没有得到对方响应，则每隔1秒重发一次，连续3个keepalive数据包都没有得到响应，那么就认为连接已经死掉。

这样，如果master端的通讯比较繁忙，可能来不及应答，这样就可能发生 FATAL: could not receive data from WAL stream: could not receive data from server: connection timeout error，目前此种情况尚未再现出来，尚需验证。

下面这段话，说明了hot-standby 的中间过程：

http://www.postgresql.org/docs/9.2/static/warm-standby.html

In standby mode, the server continuously applies WAL received from the master server. The standby server can read WAL from a WAL archive (see restore_command) or directly from the master over a TCP connection (streaming replication). The standby server will also attempt to restore any WAL found in the standby cluster's pg_xlog directory. That typically happens after a server restart, when the standby replays again WAL that was streamed from the master before the restart, but you can also manually copy files to pg_xlog at any time to have them replayed.

At startup, the standby begins by restoring all WAL available in the archive location, calling restore_command. Once it reaches the end of WAL available there and restore_command fails, it tries to restore any WAL available in the pg_xlog directory. If that fails, and streaming replication has been configured, the standby tries to connect to the primary server and start streaming WAL from the last valid record found in archive or pg_xlog. If that fails or streaming replication is not configured, or if the connection is later disconnected, the standby goes back to step 1 and tries to restore the file from the archive again. This loop of retries from the archive, pg_xlog, and via streaming replication goes on until the server is stopped or failover is triggered by a trigger file.

就是说: standby server一旦启动，就会按照 archive directory --> pg_xlog directory ---> streaming replication 的顺序来应用 wal log。

所以，单纯由于网络环境造成出错的可能性比较大。

[作者高健@博客园 luckyjackgao@gmail.com]
回到上一级页面： PostgreSQL集群方案相关索引页 | 回到顶级页面：PostgreSQL索引页 | 磨砺技术珠玑，践行数据之道，追求卓越价值

PostgreSQL Streaming Replication的FATAL ERROR的更多相关文章

配置PostgreSQL Streaming Replication集群
运行环境: Primary: 192.168.0.11 Standby: 192.168.0.21, 192.168.0.22 OS: CentOS 6.2 PostgreSQL: 9.1.2 版本以 ...
postgresql Streaming Replication监控与注意事项
一监控Streaming Replication集群 1 pg_stat_replication视图(主库端执行) pid Wal sender process的进程ID usesysid 执行流复制 ...
MySQL Got fatal error 1236原因和解决方法【转】
本文来自:http://blog.itpub.net/22664653/viewspace-1714269/ 一前言 MySQL 的主从复制作为一项高可用特性,用于将主库的数据同步到从库,在维护主 ...
Streaming replication slots in PostgreSQL 9.4
Streaming replication slots are a pending feature in PostgreSQL 9.4, as part of the logical changese ...
PostgreSQL的streaming replication
磨砺技术珠矶,践行数据之道,追求卓越价值回到上一级页面: PostgreSQL集群方案相关索引页回到顶级页面:PostgreSQL索引页[作者高健@博客园 luckyjackgao@gm ...
PostgreSQL 9.3 Streaming Replication 状态监控
postgresql是使用Streaming Replication来实现热备份的,热备份的作用如下: 灾难恢复高可用性负载均衡,当你使用Streaming Replication来实现热备份(h ...
MySql配置主从模式 Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave have equal MySQL server UUIDs; these UUIDs must be different for replication to work.
今天在学习MyCat环境搭建的时候,在配置MySql的主从模式,发现slave在配置完毕后,配置的内容全部正确的情况下,报错了? Last_IO_Error: Fatal error: The sla ...
配置MySQL主从复制报错Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave have equal MySQL server ids; these ids must be different for replication to work
配置MySQL主从复制报错 ``` Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave ha ...
PostgreSQL Cascade Replication
PostgreSQL Cascade Replication node1:master:10.2.208.10:repclia(user) node2:upstreamnode:10.2.208.11 ...

随机推荐

在CentOS7上安装和使用ZooKeeper最新版本（V3.4.12）
0.ZooKeeper文档 http://zookeeper.apache.org/doc/r3.4.11/zookeeperOver.html 1.准备在CentOS7上安装zookeeper时, ...
layui渲染form表单
有时ajax请求的数据返回时,页面已经加载了,此时就无法展示ajax加载的内容,如果要局部刷新表单,则加上如下代码: layui.use('form', function() { var form = ...
swift动态库与use_frameworks
使用Dynamic 的优势: 模块化,相对于Static Library,Framework可以将模块中的函数代码外的资源文件打包在一起. 共享可执行文件 iOS 有沙箱机制,不能跨App间共享共态库 ...
[Python 练习爬虫] XPATH基础语法
XPATH语法: // 定位根标签 / 往下层寻找 /text() 提取文本内容 /@xxx 提取属性内容 Sample: import requests from lxml import etree ...
GAutomator,GAutomatorview和Android SDK,Unity配置
1. 安装和配置JDK 环境配置中: JAVA_HOME:E:\Program Files\java\jdk1.8.0_131 Path:%JAVA_HOME%\bin;%JAVA_HOME%\jre ...
Dubbo实践（十）代理
Invoker调用代理有几种方式:普通代理.JDK.Javassist库动态代理.Javassist库动态字节码代理. 生成代理的目的是你调用invoker的相关函数后,就等同于是调用DubboIn ...
disconf实践（一）Ubuntu16.04部署disconf
在企业中,随着公司业务的扩张,用户量的增大,单一节点应用无法支撑正常的业务逻辑,比较常见的现象是访问速度变慢,甚至超时,严重时可能会造成系统宕机.为了尽量减少宕机的风险,单一节点系统需要进行水平扩展, ...
PAT——1023. 组个最小数
给定数字0-9各若干个.你可以以任意顺序排列这些数字,但必须全部使用.目标是使得最后得到的数尽可能小(注意0不能做首位).例如:给定两个0,两个1,三个5,一个8,我们得到的最小的数就是1001555 ...
Struts2学习总结——文件上传与下载
Struts2文件上传与下载 1.1.1新建一个Maven项目(demo02) 在此添加Web构面以及 struts2 构面 1.2.1配置Maven依赖(pom.xml 文件) <?xml v ...
(Les17 移动数据)expdp/impdp
oracle 11.2.0 expdp/impdp 数据泵参数 expdp参数=========================================================== ...

PostgreSQL Streaming Replication的FATAL ERROR

PostgreSQL Streaming Replication的FATAL ERROR的更多相关文章

随机推荐

热门专题