PostgreSQL Streaming Replication的FATAL ERROR
磨砺技术珠矶,践行数据之道,追求卓越价值
回到上一级页面: PostgreSQL集群方案相关索引页 回到顶级页面:PostgreSQL索引页[作者 高健@博客园 luckyjackgao@gmail.com]
首先,通过代码,查看调用关系:
libpqwalreceiver.c 中的 _PG_init 函数:
/*
* Module load callback
*/
void
_PG_init(void)
{
/* Tell walreceiver how to reach us */
if (walrcv_connect != NULL || walrcv_receive != NULL ||
walrcv_send != NULL || walrcv_disconnect != NULL)
elog(ERROR, "libpqwalreceiver already loaded"); walrcv_connect = libpqrcv_connect;
walrcv_receive = libpqrcv_receive;
walrcv_send = libpqrcv_send;
walrcv_disconnect = libpqrcv_disconnect;
}
再看
walreceiver.c 中的 WalReceiverMain 函数:
/* libpqreceiver hooks to these when loaded */
walrcv_connect_type walrcv_connect = NULL;
walrcv_receive_type walrcv_receive = NULL;
walrcv_send_type walrcv_send = NULL;
walrcv_disconnect_type walrcv_disconnect = NULL; … /* Main entry point for walreceiver process */
void
WalReceiverMain(void)
{
	…
	/* Load the libpq-specific functions */
	load_file("libpqwalreceiver", false);

	/* The module's load callback is expected to have filled in every hook */
	if (walrcv_connect == NULL || walrcv_receive == NULL ||
		walrcv_send == NULL || walrcv_disconnect == NULL)
		elog(ERROR, "libpqwalreceiver didn't initialize correctly");
	…

	/* Establish the connection to the primary for XLOG streaming */
	EnableWalRcvImmediateExit();
	walrcv_connect(conninfo, startpoint);
	DisableWalRcvImmediateExit();

	/* Initialize LogstreamResult, reply_message and feedback_message */
	LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL);
	MemSet(&reply_message, 0, sizeof(reply_message));
	MemSet(&feedback_message, 0, sizeof(feedback_message));

	/* Loop until end-of-streaming or error */
	for (;;)
	{
		…
		/* Wait a while for data to arrive */
		if (walrcv_receive(NAPTIME_PER_CYCLE, &type, &buf, &len))
		{
			/* Accept the received data, and process it */
			XLogWalRcvProcessMsg(type, buf, len);

			/*
			 * Receive any more data we can without sleeping: a timeout of
			 * 0 asks walrcv_receive not to wait for new input.
			 */
			while (walrcv_receive(0, &type, &buf, &len))
				XLogWalRcvProcessMsg(type, buf, len);

			/* Let the master know that we received some data. */
			XLogWalRcvSendReply();

			/*
			 * If we've written some records, flush them to disk and let the
			 * startup process and primary server know about them.
			 */
			XLogWalRcvFlush(false);
		}
		else
		{
			/*
			 * We didn't receive anything new, but send a status update to the
			 * master anyway, to report any progress in applying WAL.
			 */
			XLogWalRcvSendReply();
			XLogWalRcvSendHSFeedback();
		}
	}
}
再看
libpqwalreceiver.c 中的 libpqrcv_receive 函数:
/*
* Receive a message available from XLOG stream, blocking for
* maximum of 'timeout' ms.
*
* Returns:
*
* True if data was received. *type, *buffer and *len are set to
* the type of the received data, buffer holding it, and length,
* respectively.
*
* False if no data was available within timeout, or wait was interrupted
* by signal.
*
* The buffer returned is only valid until the next call of this function or
* libpq_connect/disconnect.
*
* ereports on error.
*/
static bool
libpqrcv_receive(int timeout, unsigned char *type, char **buffer, int *len)
{
	int			rawlen;

	/* Release the buffer handed out by the previous call, if any */
	if (recvBuf != NULL)
		PQfreemem(recvBuf);
	recvBuf = NULL;

	/*
	 * Try to receive a CopyData message.  async = 1, so a result of 0 means
	 * that no complete row is available yet.
	 */
	rawlen = PQgetCopyData(streamConn, &recvBuf, 1);
	if (rawlen == 0)
	{
		/*
		 * No data available yet. If the caller requested to block, wait for
		 * more data to arrive.
		 */
		if (timeout > 0)
		{
			if (!libpq_select(timeout))
				return false;
		}

		/* PQconsumeInput reports failure by returning 0 */
		if (PQconsumeInput(streamConn) == 0)
			ereport(ERROR,
					(errmsg("could not receive data from WAL stream: %s",
							PQerrorMessage(streamConn))));

		/* Now that we've consumed some input, try again */
		rawlen = PQgetCopyData(streamConn, &recvBuf, 1);
		if (rawlen == 0)
			return false;
	}

	if (rawlen == -1)			/* end-of-streaming or error */
	{
		PGresult   *res;

		res = PQgetResult(streamConn);
		if (PQresultStatus(res) == PGRES_COMMAND_OK)
		{
			/* The primary ended the COPY cleanly */
			PQclear(res);
			ereport(ERROR,
					(errmsg("replication terminated by primary server")));
		}
		PQclear(res);

		ereport(ERROR,
				(errmsg("could not receive data from WAL stream: %s",
						PQerrorMessage(streamConn))));
	}

	/* PQgetCopyData returns -2 on error; details are in PQerrorMessage */
	if (rawlen < -1)
		ereport(ERROR,
				(errmsg("could not receive data from WAL stream: %s",
						PQerrorMessage(streamConn))));

	/*
	 * Return received messages to caller: the first byte is the message
	 * type, the remainder is the payload.
	 */
	*type = *((unsigned char *) recvBuf);
	*buffer = recvBuf + sizeof(*type);
	*len = rawlen - sizeof(*type);

	return true;
}
再看:
fe-exec.c 中的 PQgetCopyData 函数:
/*
 * PQgetCopyData - read a row of data from the backend during COPY OUT
 * or COPY BOTH
 *
 * If successful, sets *buffer to point to a malloc'd row of data, and
 * returns row length (always > 0) as result.
 * Returns 0 if no row available yet (only possible if async is true),
 * -1 if end of copy (consult PQgetResult), or -2 if error (consult
 * PQerrorMessage).
 */
int
PQgetCopyData(PGconn *conn, char **buffer, int async)
{
	*buffer = NULL;				/* for all failure cases */

	/* No connection object: report an error (no message can be stored) */
	if (!conn)
		return -2;

	/* Only valid while a COPY OUT / COPY BOTH is in progress */
	if (conn->asyncStatus != PGASYNC_COPY_OUT &&
		conn->asyncStatus != PGASYNC_COPY_BOTH)
	{
		printfPQExpBuffer(&conn->errorMessage,
						  libpq_gettext("no COPY in progress\n"));
		return -2;
	}

	/* Dispatch to the implementation for the negotiated protocol version */
	if (PG_PROTOCOL_MAJOR(conn->pversion) >= 3)
		return pqGetCopyData3(conn, buffer, async);
	else
		return pqGetCopyData2(conn, buffer, async);
}
还有这个:
fe-exec.c 中的 PQgetCopyData 函数:
/*
 * PQgetCopyData - read a row of data from the backend during COPY OUT
 * or COPY BOTH
 *
 * If successful, sets *buffer to point to a malloc'd row of data, and
 * returns row length (always > 0) as result.
 * Returns 0 if no row available yet (only possible if async is true),
 * -1 if end of copy (consult PQgetResult), or -2 if error (consult
 * PQerrorMessage).
 */
int
PQgetCopyData(PGconn *conn, char **buffer, int async)
{
	*buffer = NULL;				/* for all failure cases */

	/* No connection object: report an error (no message can be stored) */
	if (!conn)
		return -2;

	/* Only valid while a COPY OUT / COPY BOTH is in progress */
	if (conn->asyncStatus != PGASYNC_COPY_OUT &&
		conn->asyncStatus != PGASYNC_COPY_BOTH)
	{
		printfPQExpBuffer(&conn->errorMessage,
						  libpq_gettext("no COPY in progress\n"));
		return -2;
	}

	/* Dispatch to the implementation for the negotiated protocol version */
	if (PG_PROTOCOL_MAJOR(conn->pversion) >= 3)
		return pqGetCopyData3(conn, buffer, async);
	else
		return pqGetCopyData2(conn, buffer, async);
}
事实上,从上面的逻辑,可以看到:
如果slave端读取wal时发生了错误,那么它会在循环中再次试图读取,直到成功为止。
所以,出错了不要紧。当然这种 FATAL ERROR出现,肯定是要引起重视的。
经过试验,发现有几种可能会出现错误:
错误发生原因1:如果没有采用 archive log模式,那么当master端事务任务过重,导致在线wal log很快被删除覆盖,那么slave端就会找不到相应的wal log,于是在master端和客户端都出现:
FATAL: could not receive data from WAL stream: FATAL: requested WAL segment 0000000100000000000000XX has already been removed
错误发生原因2:
如果在master端的postgresql.conf文件中,设置了 replication_timeout,但wal_receiver_status_interval 却等于零,
则经过 replication_timeout秒后,如果master和slave之间因为master不忙等原因没有通信,则master会主动把这个连接干掉。
所以此时
master端出现:LOG:terminating walsender process due to replication timeout
slave端出现: FATAL: could not receive data from WAL stream:
可能的错误发生原因3:
这可能是和 recovery.conf 中的primary_conninfo有关:
例如:
primary_conninfo = 'host=master port=5432 application_name=mypg user=postgres connect_timeout=10 keepalives_idle=10 keepalives_interval=1 keepalives_count=3'
这样,当连接空闲达到10秒(keepalives_idle)后,就会开始发送keepalive探测包,以检查当前连接是否已经失效;如果没有得到对方响应,则每隔1秒(keepalives_interval)重发一次,连续3次(keepalives_count)都没有得到响应,就认为连接已经死掉。
这样,如果master端的通讯比较繁忙,可能来不及应答,这样就可能发生 FATAL: could not receive data from WAL stream: could not receive data from server: connection timeout error,目前此种情况尚未再现出来,尚需验证。
下面这段话,说明了hot-standby 的中间过程:
http://www.postgresql.org/docs/9.2/static/warm-standby.html
In standby mode, the server continuously applies WAL received from the master server. The standby server can read WAL from a WAL archive (see restore_command) or directly from the master over a TCP connection (streaming replication). The standby server will also attempt to restore any WAL found in the standby cluster's pg_xlog directory. That typically happens after a server restart, when the standby replays again WAL that was streamed from the master before the restart, but you can also manually copy files to pg_xlog at any time to have them replayed. At startup, the standby begins by restoring all WAL available in the archive location, calling restore_command. Once it reaches the end of WAL available there and restore_command fails, it tries to restore any WAL available in the pg_xlog directory. If that fails, and streaming replication has been configured, the standby tries to connect to the primary server and start streaming WAL from the last valid record found in archive or pg_xlog. If that fails or streaming replication is not configured, or if the connection is later disconnected, the standby goes back to step 1 and tries to restore the file from the archive again. This loop of retries from the archive, pg_xlog, and via streaming replication goes on until the server is stopped or failover is triggered by a trigger file.
就是说: standby server一旦启动,就会按照 archive directory --> pg_xlog directory ---> streaming replication 的顺序来应用 wal log。
所以,单纯由于网络环境造成出错的可能性比较大。
[作者 高健@博客园 luckyjackgao@gmail.com]
回到上一级页面: PostgreSQL集群方案相关索引页 回到顶级页面:PostgreSQL索引页磨砺技术珠矶,践行数据之道,追求卓越价值
PostgreSQL Streaming Replication的FATAL ERROR的更多相关文章
- 配置PostgreSQL Streaming Replication集群
运行环境: Primary: 192.168.0.11 Standby: 192.168.0.21, 192.168.0.22 OS: CentOS 6.2 PostgreSQL: 9.1.2 版本以 ...
- postgresql Streaming Replication监控与注意事项
一监控Streaming Replication集群 1 pg_stat_replication视图(主库端执行) pid Wal sender process的进程ID usesysid 执行流复制 ...
- MySQL Got fatal error 1236原因和解决方法【转】
本文来自:http://blog.itpub.net/22664653/viewspace-1714269/ 一 前言 MySQL 的主从复制作为一项高可用特性,用于将主库的数据同步到从库,在维护主 ...
- Streaming replication slots in PostgreSQL 9.4
Streaming replication slots are a pending feature in PostgreSQL 9.4, as part of the logical changese ...
- PostgreSQL的streaming replication
磨砺技术珠矶,践行数据之道,追求卓越价值回到上一级页面: PostgreSQL集群方案相关索引页 回到顶级页面:PostgreSQL索引页[作者 高健@博客园 luckyjackgao@gm ...
- PostgreSQL 9.3 Streaming Replication 状态监控
postgresql是使用Streaming Replication来实现热备份的,热备份的作用如下: 灾难恢复 高可用性 负载均衡,当你使用Streaming Replication来实现热备份(h ...
- MySql配置主从模式 Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave have equal MySQL server UUIDs; these UUIDs must be different for replication to work.
今天在学习MyCat环境搭建的时候,在配置MySql的主从模式,发现slave在配置完毕后,配置的内容全部正确的情况下,报错了? Last_IO_Error: Fatal error: The sla ...
- 配置MySQL主从复制报错Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave have equal MySQL server ids; these ids must be different for replication to work
配置MySQL主从复制报错 ``` Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave ha ...
- PostgreSQL Cascade Replication
PostgreSQL Cascade Replication node1:master:10.2.208.10:repclia(user) node2:upstreamnode:10.2.208.11 ...
随机推荐
- webpack和gulp
在没有使用任何自动化工具之前,如果用sass写了css, 用coffee写了js, 那么我们必须手动用相应的compiler去编译各自的文件,然后各自minify(js,css文件压缩并合并).这时如 ...
- css3鼠标经过出现转圈菜单(仿)
<!DOCTYPE html> <html> <head lang="en"> <meta charset="UTF-8&quo ...
- Guava包学习---Lists
Guava包是我最近项目中同事推荐使用的,是google推出的库.里面的功能非常多,包括了集合.缓存.原生类型支持.并发库.通用注解.字符串处理.IO等.我们项目中使用到了guava依赖,但是实际上只 ...
- [19/04/18-星期四] Java的动态性_动态编译(DynamicCompiler,Dynamic:动态的,Compiler:编译程序)
一.概念 应用场景:如在线评测系统,客户端编写代码,上传到服务器端编译运行:服务器动态加载某些类文件进行编译 /*** * */ package cn.sxt.jvm; import java.io. ...
- Java50道经典习题-程序21 求阶乘
题目:求1+2!+3!+...+20!的和分析:使用递归求解 0的阶乘和1的阶乘都为1 public class Prog21{ public static void main(String[] ar ...
- 【CSS】iconfont的使用
说到浏览器对@font-face的兼容问题,这里涉及到一个字体format的问题,因为不同的浏览器对字体格式支持是不一致的,这样大家有必要了解一下,各种版本的浏览器支持什么样的字体,前面也简单带到了有 ...
- 20155203 2016-2017-4 《Java程序设计》第9周学习总结
20155203 2016-2017-4 <Java程序设计>第9周学习总结 教材学习内容总结 课堂内容 两个类如果有公共的部分要放在父类中,多次复用.当我们用父类或接口去声明对象的引用生 ...
- VIM在Win7上的安装教程
1.下载 目前VIM在其官网上的最新版本为7.4,Windows版本名称为GVIM,在百度软件中可以下载GVIM的最新版本,建议 在百度上下载,因为比较快.在百度上搜索"GVIM" ...
- 关于python线程池threadpool
#coding=utf-8 import time import threadpool def wait_time(n): print('%d\n' % n) time.sleep(2) #在线程池中 ...
- ASP.NET如何批量保存动态生成的文本框?
对于OA系统,表单签核功能必不可少.而根据公司的情况,表单自然又五花八门,所以就要求能够让用户自己建立表单并设定表单的流程.填写内容等等.我之前写过一篇文章[地址:pivot的用法(SQL SERVE ...