磨砺技术珠矶,践行数据之道,追求卓越价值
回到上一级页面: PostgreSQL集群方案相关索引页     回到顶级页面:PostgreSQL索引页[作者 高健@博客园  luckyjackgao@gmail.com]

首先,通过代码,查看调用关系:

libpqwalreceiver.c            _PG_init 関数                

/*
* Module load callback
*/
void
_PG_init(void)
{
/* Tell walreceiver how to reach us */
if (walrcv_connect != NULL || walrcv_receive != NULL ||
walrcv_send != NULL || walrcv_disconnect != NULL)
elog(ERROR, "libpqwalreceiver already loaded"); walrcv_connect = libpqrcv_connect;
walrcv_receive = libpqrcv_receive;
walrcv_send = libpqrcv_send;
walrcv_disconnect = libpqrcv_disconnect;
}

再看

walreceiver.c            WalReceiverMain 関数                     

/* libpqreceiver hooks to these when loaded */
walrcv_connect_type walrcv_connect = NULL;
walrcv_receive_type walrcv_receive = NULL;
walrcv_send_type walrcv_send = NULL;
walrcv_disconnect_type walrcv_disconnect = NULL; … /* Main entry point for walreceiver process */
void
WalReceiverMain(void)
{

/* Load the libpq-specific functions */
load_file("libpqwalreceiver", false);
if (walrcv_connect == NULL || walrcv_receive == NULL ||
walrcv_send == NULL || walrcv_disconnect == NULL)
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
… /* Establish the connection to the primary for XLOG streaming */
EnableWalRcvImmediateExit();
walrcv_connect(conninfo, startpoint);
DisableWalRcvImmediateExit(); /* Initialize LogstreamResult, reply_message and feedback_message */
LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL);
MemSet(&reply_message, , sizeof(reply_message));
MemSet(&feedback_message, , sizeof(feedback_message)); /* Loop until end-of-streaming or error */
for (;;)
{

/* Wait a while for data to arrive */
if (walrcv_receive(NAPTIME_PER_CYCLE, &type, &buf, &len))
{
/* Accept the received data, and process it */
XLogWalRcvProcessMsg(type, buf, len); /* Receive any more data we can without sleeping */
while (walrcv_receive(, &type, &buf, &len))
XLogWalRcvProcessMsg(type, buf, len); /* Let the master know that we received some data. */
XLogWalRcvSendReply(); /*
* If we've written some records, flush them to disk and let the
* startup process and primary server know about them.
*/
XLogWalRcvFlush(false);
}
else
{
/*
* We didn't receive anything new, but send a status update to the
* master anyway, to report any progress in applying WAL.
*/
XLogWalRcvSendReply();
XLogWalRcvSendHSFeedback();
}
}
}

再看

 libpqwalreceiver.c            libpqrcv_receive 関数                        

/*
* Receive a message available from XLOG stream, blocking for
* maximum of 'timeout' ms.
*
* Returns:
*
* True if data was received. *type, *buffer and *len are set to
* the type of the received data, buffer holding it, and length,
* respectively.
*
* False if no data was available within timeout, or wait was interrupted
* by signal.
*
* The buffer returned is only valid until the next call of this function or
* libpq_connect/disconnect.
*
* ereports on error.
*/
static bool
libpqrcv_receive(int timeout, unsigned char *type, char **buffer, int *len)
{
int rawlen; if (recvBuf != NULL)
PQfreemem(recvBuf);
recvBuf = NULL; /* Try to receive a CopyData message */
rawlen = PQgetCopyData(streamConn, &recvBuf, ); if (rawlen == )
{
/*
* No data available yet. If the caller requested to block, wait for
* more data to arrive.
*/
if (timeout > )
{
if (!libpq_select(timeout))
return false;
} if (PQconsumeInput(streamConn) == )
ereport(ERROR,
(errmsg("could not receive data from WAL stream: %s",
PQerrorMessage(streamConn)))); /* Now that we've consumed some input, try again */
rawlen = PQgetCopyData(streamConn, &recvBuf, );
if (rawlen == )
return false;
} if (rawlen == -) /* end-of-streaming or error */
{
PGresult *res; res = PQgetResult(streamConn);
if (PQresultStatus(res) == PGRES_COMMAND_OK)
{
PQclear(res);
ereport(ERROR,
(errmsg("replication terminated by primary server")));
}
PQclear(res); ereport(ERROR,
(errmsg("could not receive data from WAL stream: %s",
PQerrorMessage(streamConn))));
} if (rawlen < -)
ereport(ERROR,
(errmsg("could not receive data from WAL stream: %s",
PQerrorMessage(streamConn)))); /* Return received messages to caller */
*type = *((unsigned char *) recvBuf);
*buffer = recvBuf + sizeof(*type);
*len = rawlen - sizeof(*type); return true;
}

再看:

    fe-exec.c        PQgetCopyData 関数                        

    /*
* PQgetCopyData - read a row of data from the backend during COPY OUT
* or COPY BOTH
*
* If successful, sets *buffer to point to a malloc'd row of data, and
* returns row length (always > 0) as result.
* Returns 0 if no row available yet (only possible if async is true),
* -1 if end of copy (consult PQgetResult), or -2 if error (consult
* PQerrorMessage).
*/
int
PQgetCopyData(PGconn *conn, char **buffer, int async)
{
*buffer = NULL; /* for all failure cases */
if (!conn)
return -;
if (conn->asyncStatus != PGASYNC_COPY_OUT &&
conn->asyncStatus != PGASYNC_COPY_BOTH)
{
printfPQExpBuffer(&conn->errorMessage,
libpq_gettext("no COPY in progress\n"));
return -;
}
if (PG_PROTOCOL_MAJOR(conn->pversion) >= )
return pqGetCopyData3(conn, buffer, async);
else
return pqGetCopyData2(conn, buffer, async);
}

还有这个:

    fe-exec.c        PQgetCopyData 関数                        

    /*
* PQgetCopyData - read a row of data from the backend during COPY OUT
* or COPY BOTH
*
* If successful, sets *buffer to point to a malloc'd row of data, and
* returns row length (always > 0) as result.
* Returns 0 if no row available yet (only possible if async is true),
* -1 if end of copy (consult PQgetResult), or -2 if error (consult
* PQerrorMessage).
*/
int
PQgetCopyData(PGconn *conn, char **buffer, int async)
{
*buffer = NULL; /* for all failure cases */
if (!conn)
return -;
if (conn->asyncStatus != PGASYNC_COPY_OUT &&
conn->asyncStatus != PGASYNC_COPY_BOTH)
{
printfPQExpBuffer(&conn->errorMessage,
libpq_gettext("no COPY in progress\n"));
return -;
}
if (PG_PROTOCOL_MAJOR(conn->pversion) >= )
return pqGetCopyData3(conn, buffer, async);
else
return pqGetCopyData2(conn, buffer, async);
}

事实上,从上面的逻辑,可以看到:

如果slave端读取wal,发生了错误,那么它会在循环中再次试图读取,知道成功为止。

所以,出错了不要紧。当然这种 FATAL ERROR出现,肯定是要引起重视的。

经过试验,发现有几种可能会出现错误:

错误发生原因1:如果没有采用 archive log模式,那么当master端事务任务过重,导致在线wal log很快被删除覆盖,那么slave端就会找不到相应的wal log,于是在master端和客户端都出现:

FATAL:  could not receive data from WAL stream: FATAL:  requested WAL segment 0000000100000000000000XX has already been removed

错误发生原因2:

如果在master端的postgresql.conf文件中,设置了 replication_timeout,但wal_receiver_status_interval 却等于零,

则经过 replication_timeout秒后,如果master和slave之间因为master不忙等原因没有通信,则master会主动把这个连接干掉。

所以此时

master端出现:LOG:terminating walsender process due to replication timeout

slave端出现:   FATAL: could not receive data from WAL stream:

可能的错误发生原因3:

这可能是和 recovery.conf 中的primary_conninfo有关:

例如:

primary_conninfo = 'host=master port=5432 application_name=mypg user=postgres connect_timeout=10 keepalives_idle=10 keepalives_interval=1 keepalives_count=3'

这样,每隔10秒,为了看看当前连接是否已经失效,就要发送3个keepalive数据包,如果在1秒的时间里没有得到对方响应,那么就认为连接已经死掉。

这样,如果master端的通讯比较繁忙,可能来不及应答,这样就可能发生 FATAL: could not receive data from WAL stream: could not receive data from server: connection timeout error,目前此种情况尚未再现出来,尚需验证。

下面这段话,说明了hot-standby 的中间过程:

http://www.postgresql.org/docs/9.2/static/warm-standby.html

In standby mode, the server continuously applies WAL received from the master server. The standby server can read WAL from a WAL archive (see restore_command) or directly from the master over a TCP connection (streaming replication). The standby server will also attempt to restore any WAL found in the standby cluster's pg_xlog directory. That typically happens after a server restart, when the standby replays again WAL that was streamed from the master before the restart, but you can also manually copy files to pg_xlog at any time to have them replayed.

At startup, the standby begins by restoring all WAL available in the archive location, calling restore_command. Once it reaches the end of WAL available there and restore_command fails, it tries to restore any WAL available in the pg_xlog directory. If that fails, and streaming replication has been configured, the standby tries to connect to the primary server and start streaming WAL from the last valid record found in archive or pg_xlog. If that fails or streaming replication is not configured, or if the connection is later disconnected, the standby goes back to step 1 and tries to restore the file from the archive again. This loop of retries from the archive, pg_xlog, and via streaming replication goes on until the server is stopped or failover is triggered by a trigger file.

就是说: standby server一旦启动,就会按照 archive directory  --> pg_xlog directory ---> streaming replication 的顺序来应用 wal log。

所以,单纯由于网络环境造成出错的可能性比较大。

[作者 高健@博客园  luckyjackgao@gmail.com]
回到上一级页面: PostgreSQL集群方案相关索引页     回到顶级页面:PostgreSQL索引页磨砺技术珠矶,践行数据之道,追求卓越价值

PostgreSQL Streaming Replication的FATAL ERROR的更多相关文章

  1. 配置PostgreSQL Streaming Replication集群

    运行环境: Primary: 192.168.0.11 Standby: 192.168.0.21, 192.168.0.22 OS: CentOS 6.2 PostgreSQL: 9.1.2 版本以 ...

  2. postgresql Streaming Replication监控与注意事项

    一监控Streaming Replication集群 1 pg_stat_replication视图(主库端执行) pid Wal sender process的进程ID usesysid 执行流复制 ...

  3. MySQL Got fatal error 1236原因和解决方法【转】

    本文来自:http://blog.itpub.net/22664653/viewspace-1714269/ 一 前言  MySQL 的主从复制作为一项高可用特性,用于将主库的数据同步到从库,在维护主 ...

  4. Streaming replication slots in PostgreSQL 9.4

    Streaming replication slots are a pending feature in PostgreSQL 9.4, as part of the logical changese ...

  5. PostgreSQL的streaming replication

    磨砺技术珠矶,践行数据之道,追求卓越价值回到上一级页面: PostgreSQL集群方案相关索引页     回到顶级页面:PostgreSQL索引页[作者 高健@博客园  luckyjackgao@gm ...

  6. PostgreSQL 9.3 Streaming Replication 状态监控

    postgresql是使用Streaming Replication来实现热备份的,热备份的作用如下: 灾难恢复 高可用性 负载均衡,当你使用Streaming Replication来实现热备份(h ...

  7. MySql配置主从模式 Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave have equal MySQL server UUIDs; these UUIDs must be different for replication to work.

    今天在学习MyCat环境搭建的时候,在配置MySql的主从模式,发现slave在配置完毕后,配置的内容全部正确的情况下,报错了? Last_IO_Error: Fatal error: The sla ...

  8. 配置MySQL主从复制报错Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave have equal MySQL server ids; these ids must be different for replication to work

    配置MySQL主从复制报错 ``` Last_IO_Error: Fatal error: The slave I/O thread stops because master and slave ha ...

  9. PostgreSQL Cascade Replication

    PostgreSQL Cascade Replication node1:master:10.2.208.10:repclia(user) node2:upstreamnode:10.2.208.11 ...

随机推荐

  1. TreadingTCPServer

    TreadingTCPServer实现的socket服务器内部会为每个client创建一个线程,该线程用来和客户端进行交互. 1.TreadingTCPServer基础 使用TreadingTCPSe ...

  2. python面向对象之类成员修饰符

      类的所有成员分为: 公有成员,在任何地方都能访问 私有成员,只有在类的内部才能访问 私有成员和公有成员的定义不同:私有成员命名时,前两个字符是下划线.(特殊成员除外,例如:__init__.__c ...

  3. angular2 Router类中的路由跳转navigate

    navigate是Router类的一个方法,主要用来路由跳转. 函数定义 navigate(commands: any[], extras?: NavigationExtras) : Promise` ...

  4. 20145324 Java实验三

    一.git 上传代码步骤 上传结果 原代码 下载同学代码 更改 二.重构 原代码 rename 原代码 实验总结 这次实验比较简单,而且终于解决了git的问题,很开心 步骤 耗时 百分比 需求分析 1 ...

  5. `ECS弹性计算服务

    云服务器(Elastic Compute Service 简称ECS)是一种简单高效,处理能力可弹性伸缩的计算服务.能快速构建更稳定.安全的应用,提升运维效率,降低IT成本. 云服务器ecs作用如下: ...

  6. geomesa hbase geoserver

    在geoserver中配置hbase ln -s /root/hbase/hbase-1.4.8/conf/hbase-site.xml /root/tomcat/apache-tomcat-7.0. ...

  7. Maven创建项目一些常见的问题

    1 .创建的项目中没有src/main/java.没有src/test/java 主要原因在于在创建项目的时候,使用的是系统自带的jdk,修改方法: 右键项目——Properties——javaBui ...

  8. unordered_map 遇到 vector subscript out of range 的错误提示

    错误类型 当调用unordered_map的函数的时候,会出现如下问题: 使用linux运行则会提示 float exeption(core dump) 原因 遇到vector subscript o ...

  9. 精准测试白皮书v3.0-2019最新版

    现代社会是建立在各种以计算机为基石的软件技术基础之上的.随着日新月异的需求变化,软件系统越来越复杂.很多人觉得软件开发才是重要环节,但实际上,无法对大型软件进行有效的质量把控,就无法真正构建与维护大型 ...

  10. TensorFlow简要教程及线性回归算法示例

    TensorFlow是谷歌推出的深度学习平台,目前在各大深度学习平台中使用的最广泛. 一.安装命令 pip3 install -U tensorflow --default-timeout=1800 ...