基于binlog来分析mysql的行记录修改情况

https://www.cnblogs.com/xinysu/archive/2017/05/26/6908722.html

import pymysql
from pymysql.cursors import DictCursor
import re
import os
import sys
import datetime
import time
import logging
import importlib
# Re-import the logging module before configuring it -- presumably so that
# basicConfig() below is not ignored when handlers were already installed
# by the hosting environment (NOTE(review): confirm this is still needed).
importlib.reload(logging)
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s ',
    level=logging.DEBUG,
)

# Help text printed when the script is started without arguments, with
# --help, or with an option it does not recognise.
usage=''' usage: python [script's path] [option]
ALL options need to assign:

-h : host, the database host，which database will store the results after analysis
-u : user, the db user
-p : password, the db user's password
-P : port, the db port
-f : file path, the binlog file
-tr : table name for record , the table name to store the row record
-tt : table name for transaction, the table name to store transactions
Example: python queryanalyse.py -h=127.0.0.1 -P=3310 -u=root -p=password -f=/tmp/stock_binlog.log -tt=flashback.tbtran -tr=flashback.tbrow

'''

class queryanalyse:
    """Summarise the row changes recorded in a decoded (text) binlog file.

    The file named by ``-f`` is scanned line by line. Every row event
    (``### INSERT INTO`` / ``### UPDATE`` / ``### DELETE FROM``) is stored in
    the row table (``-tr``) and every transaction, delimited by a line
    containing ``Xid =``, is stored in the transaction table (``-tt``).
    ``binlogdesc`` then prints aggregate statistics computed from those two
    tables.
    """

    # Prefix of a decoded row-event line -> value stored in the sqltype column
    # (1 is insert, 2 is update, 3 is delete).
    _ROW_EVENT_TYPES = (
        ('### INSERT INTO', 1),
        ('### UPDATE', 2),
        ('### DELETE FROM', 3),
    )

    # Command-line option -> name of the attribute that receives its value.
    # -P is handled separately because it must be converted to int.
    _TEXT_OPTIONS = {
        '-h': 'host',
        '-u': 'user',
        '-p': 'password',
        '-f': 'fpath',
        '-tr': 'tbrow',
        '-tt': 'tbtran',
    }

    def __init__(self):
        """Read the command-line options and open the MySQL connection."""
        self.host = ''
        self.user = ''
        self.password = ''
        # Must be an int: pymysql does not accept the port as a string, so the
        # default has to be usable when -P is not given.
        self.port = 3306
        self.fpath = ''
        self.tbrow = ''
        self.tbtran = ''

        self._get_db()
        logging.info('assign values to parameters is done:host={},user={},password=***,port={},fpath={},tb_for_record={},tb_for_tran={}'.format(self.host,self.user,self.port,self.fpath,self.tbrow,self.tbtran))

        self.mysqlconn = pymysql.connect(host=self.host, user=self.user, password=self.password, port=self.port, charset='utf8')
        self.cur = self.mysqlconn.cursor(cursor=DictCursor)
        logging.info('MySQL which userd to store binlog event connection is ok')

        # State carried across binlog lines while a transaction is parsed.
        self.begin_time = ''
        self.end_time = ''
        self.db_name = ''
        self.tb_name = ''

    def _get_db(self):
        """Assign the ``-x=value`` command-line options to the instance.

        The password is taken in clear text from the command line; the binlog
        file can be copied to a non-production host for analysis, so no
        special handling is applied to it.

        Prints the usage text and exits when no argument or ``--help`` is
        given, or when a known option carries no ``=value`` part. An unknown
        option only prints the usage text.

        Raises:
            ValueError: if the value of ``-P`` is not an integer.
        """
        logging.info('begin to assign values to parameters')
        if len(sys.argv) == 1:
            print(usage)
            sys.exit(1)
        if sys.argv[1] == '--help':
            print(usage)
            sys.exit()

        for arg in sys.argv[1:]:
            # Split on the first '=' only so that values containing '='
            # (typically passwords) are kept intact.
            name, sep, value = arg.partition('=')
            if name != '-P' and name not in self._TEXT_OPTIONS:
                print(usage)
                continue
            if not sep:
                # Known option without a value.
                print(usage)
                sys.exit(1)
            if name == '-P':
                self.port = int(value)
            else:
                setattr(self, self._TEXT_OPTIONS[name], value)

    @staticmethod
    def _event_time(line):
        """Return the timestamp of a binlog event header as 'YYYY-MM-DD HH:MM:SS'.

        *line* starts with ``#YYMMDD HH:MM:SS`` followed by ``server id``; the
        two-digit year is expanded by replacing the leading ``#`` with ``20``.

        Raises:
            ValueError: if *line* does not contain ``server``.
        """
        stamp = line[:line.index('server')].rstrip(' ').replace('#', '20')
        return stamp[0:4] + '-' + stamp[4:6] + '-' + stamp[6:]

    @staticmethod
    def _percent(part, total):
        """Return ``part`` as a truncated integer percentage of ``total`` (0 when total is 0)."""
        if not total:
            return 0
        return int(part * 100 / total)

    def create_tab(self):
        """Create the two result tables if needed and empty them.

        One table stores transactions, the other stores every changed row;
        one transaction can own many changed rows.
        """
        logging.info('creating table ...')
        statements = (
            '''CREATE TABLE IF NOT EXISTS {} (
`auto_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`begin_time` datetime NOT NULL,
`end_time` datetime NOT NULL,
PRIMARY KEY (`auto_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8'''.format(self.tbtran),
            '''CREATE TABLE IF NOT EXISTS {} (
`auto_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`sqltype` int(11) NOT NULL COMMENT '1 is insert,2 is update,3 is delete',
`tran_num` int(11) NOT NULL COMMENT 'the transaction number',
`dbname` varchar(50) NOT NULL,
`tbname` varchar(50) NOT NULL,
PRIMARY KEY (`auto_id`),
KEY `sqltype` (`sqltype`),
KEY `dbname` (`dbname`),
KEY `tbname` (`tbname`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8'''.format(self.tbrow),
            'truncate table {}'.format(self.tbtran),
            'truncate table {}'.format(self.tbrow),
        )
        # One statement per execute(): a ';'-separated batch only works when
        # the connection was opened with multi-statement support.
        for statement in statements:
            self.cur.execute(statement)
        logging.info('created table {} and {}'.format(self.tbrow,self.tbtran))

    def rowrecord(self):
        """Parse the binlog file and store its transactions and row changes.

        A transaction ends at a line containing ``Xid =``; the rows collected
        for it and the transaction record itself are written and committed
        together, one commit per transaction.

        Returns:
            None on success, or the string 'funtion rowrecord error' when any
            exception occurred (the exception is logged with its traceback).
        """
        # Table names cannot be bound as parameters, so only they are
        # formatted in; the values go through the driver's parameter binding.
        row_insert = 'insert into {}(sqltype,tran_num,dbname,tbname) VALUES (%s,%s,%s,%s)'.format(self.tbrow)
        tran_insert = 'insert into {}(begin_time,end_time) VALUES (%s,%s)'.format(self.tbtran)
        try:
            tran_num = 1   # sequence number of the transaction being read
            pending = []   # row records of the transaction being read

            self.create_tab()

            # errors='replace': only ASCII markers are inspected, so bytes
            # that cannot be decoded must not abort the analysis.
            with open(self.fpath, 'r', errors='replace') as binlog_file:
                logging.info('begining to analyze the binlog file ,this may be take a long time !!!')
                logging.info('analyzing...')

                for bline in binlog_file:
                    if 'Table_map:' in bline:
                        # The first Table_map of a transaction marks its start.
                        if not pending:
                            self.begin_time = self._event_time(bline)
                        target = bline[bline.index('Table_map'):].split(' ')[1].replace('`', '').split('.')
                        self.db_name = target[0]
                        self.tb_name = target[1]
                        continue

                    sqltype = next(
                        (code for prefix, code in self._ROW_EVENT_TYPES if bline.startswith(prefix)),
                        None,
                    )
                    if sqltype is not None:
                        pending.append((sqltype, tran_num, self.db_name, self.tb_name))
                    elif 'Xid =' in bline:
                        self.end_time = self._event_time(bline)
                        if pending:
                            self.cur.executemany(row_insert, pending)
                        self.cur.execute(tran_insert, (self.begin_time, self.end_time))
                        self.mysqlconn.commit()
                        pending = []
                        tran_num += 1

        except Exception:
            # Keep the historical return value, but do not hide the cause.
            logging.exception('funtion rowrecord error')
            return 'funtion rowrecord error'

    def binlogdesc(self):
        """Print the statistics computed from the two result tables."""
        t_num = 0
        r_num = 0
        logging.info('Analysed result printing...\n')

        # Total number of transactions and of changed rows.
        sql = "select 'tbtran' name,count(*) nums from {} union all select 'tbrow' name,count(*) nums from {};".format(self.tbtran,self.tbrow)
        self.cur.execute(sql)
        for row in self.cur.fetchall():
            if row['name'] == 'tbtran':
                t_num = row['nums']
            else:
                r_num = row['nums']
        print('This binlog file has {} transactions, {} rows are changed '.format(t_num,r_num))

        # Longest transaction and the duration of every transaction in five
        # bands. The binlog timestamps only have one-second resolution.
        sql = '''select
count(case when cost_sec between 0 and 1 then 1 end ) cos_1,
count(case when cost_sec between 1.1 and 5 then 1 end ) cos_5,
count(case when cost_sec between 5.1 and 10 then 1 end ) cos_10,
count(case when cost_sec between 10.1 and 30 then 1 end ) cos_30,
count(case when cost_sec >30.1 then 1 end ) cos_more,
max(cost_sec) cos_max
from
(
select
auto_id,timestampdiff(second,begin_time,end_time) cost_sec
from {}
) a;'''.format(self.tbtran)
        self.cur.execute(sql)

        for row in self.cur.fetchall():
            print('The most cost time : {} '.format(row['cos_max']))
            print('The distribution map of each transaction costed time: ')
            print('Cost time between 0 and 1 second : {} , {}%'.format(row['cos_1'], self._percent(row['cos_1'], t_num)))
            print('Cost time between 1.1 and 5 second : {} , {}%'.format(row['cos_5'], self._percent(row['cos_5'], t_num)))
            print('Cost time between 5.1 and 10 second : {} , {}%'.format(row['cos_10'], self._percent(row['cos_10'], t_num)))
            print('Cost time between 10.1 and 30 second : {} , {}%'.format(row['cos_30'], self._percent(row['cos_30'], t_num)))
            print('Cost time > 30.1 : {} , {}%\n'.format(row['cos_more'], self._percent(row['cos_more'], t_num)))

        # Largest transaction by changed rows and the row count of every
        # transaction in five bands. The last band starts right after 10000 so
        # that a transaction with exactly 10001 rows is counted.
        sql = '''select
count(case when nums between 0 and 10 then 1 end ) row_1,
count(case when nums between 11 and 100 then 1 end ) row_2,
count(case when nums between 101 and 1000 then 1 end ) row_3,
count(case when nums between 1001 and 10000 then 1 end ) row_4,
count(case when nums >10000 then 1 end ) row_5,
max(nums) row_max
from
(
select
count(*) nums
from {} group by tran_num
) a;'''.format(self.tbrow)
        self.cur.execute(sql)

        for row in self.cur.fetchall():
            print('The most changed rows for each row: {} '.format(row['row_max']))
            print('The distribution map of each transaction changed rows : ')
            print('Changed rows between 1 and 10 : {} , {}%'.format(row['row_1'], self._percent(row['row_1'], t_num)))
            print('Changed rows between 11 and 100 : {} , {}%'.format(row['row_2'], self._percent(row['row_2'], t_num)))
            print('Changed rows between 101 and 1000 : {} , {}%'.format(row['row_3'], self._percent(row['row_3'], t_num)))
            print('Changed rows between 1001 and 10000 : {} , {}%'.format(row['row_4'], self._percent(row['row_4'], t_num)))
            print('Changed rows > 10000 : {} , {}%\n'.format(row['row_5'], self._percent(row['row_5'], t_num)))

        # Share of insert / update / delete among the changed rows.
        sql = 'select sqltype ,count(*) nums from {} group by sqltype ;'.format(self.tbrow)
        self.cur.execute(sql)
        rows = self.cur.fetchall()

        print('The distribution map of the {} changed rows : '.format(r_num))
        for row in rows:
            if row['sqltype'] == 1:
                print('INSERT rows :{} , {}% '.format(row['nums'], self._percent(row['nums'], r_num)))
            if row['sqltype'] == 2:
                print('UPDATE rows :{} , {}% '.format(row['nums'], self._percent(row['nums'], r_num)))
            if row['sqltype'] == 3:
                print('DELETE rows :{} , {}%\n '.format(row['nums'], self._percent(row['nums'], r_num)))

        # The ten tables with the most changed rows.
        sql = '''select
dbname,tbname ,
count(*) ALL_rows,
count(*)*100/{} per,
count(case when sqltype=1 then 1 end) INSERT_rows,
count(case when sqltype=2 then 1 end) UPDATE_rows,
count(case when sqltype=3 then 1 end) DELETE_rows
from {}
group by dbname,tbname
order by ALL_rows desc
limit 10;'''.format(r_num,self.tbrow)
        self.cur.execute(sql)
        rows = self.cur.fetchall()

        print('The distribution map of the {} changed rows : '.format(r_num))
        print('tablename'.ljust(50),
              '|','changed_rows'.center(15),
              '|','percent'.center(10),
              '|','insert_rows'.center(18),
              '|','update_rows'.center(18),
              '|','delete_rows'.center(18)
              )
        print('-------------------------------------------------------------------------------------------------------------------------------------------------')
        for row in rows:
            total = row['ALL_rows']
            print((row['dbname']+'.'+row['tbname']).ljust(50),
                  '|',str(total).rjust(15),
                  '|',(str(int(row['per']))+'%').rjust(10),
                  '|',str(row['INSERT_rows']).rjust(10)+' , '+(str(self._percent(row['INSERT_rows'], total))+'%').ljust(5),
                  '|',str(row['UPDATE_rows']).rjust(10)+' , '+(str(self._percent(row['UPDATE_rows'], total))+'%').ljust(5),
                  '|',str(row['DELETE_rows']).rjust(10)+' , '+(str(self._percent(row['DELETE_rows'], total))+'%').ljust(5),
                  )
        print('\n')

        logging.info('Finished to analyse the binlog file !!!')

    def closeconn(self):
        """Close the cursor and the MySQL connection it belongs to."""
        self.cur.close()
        self.mysqlconn.close()
        logging.info('release db connections\n')

def main():
    """Run the analysis: load the binlog into MySQL, print the report, disconnect."""
    p = queryanalyse()
    try:
        p.rowrecord()
        p.binlogdesc()
    finally:
        # Release the connection even when loading or reporting fails.
        p.closeconn()


if __name__ == "__main__":
    main()

基于binlog来分析mysql的行记录修改情况的更多相关文章

基于binlog来分析mysql的行记录修改情况（python脚本分析）
最近写完mysql flashback,突然发现还有有这种使用场景:有些情况下,可能会统计在某个时间段内,MySQL修改了多少数据量?发生了多少事务?主要是哪些表格发生变动?变动的数量是怎 ...
使用Anemometer分析MySQL慢查询记录
数据库管理员一般是用percona的toolkit工具来分析MySQL慢查询记录,但是不够直观. 下面介绍一款比较直观的工具来统计分析MySQL慢查询记录anemometer. 在使用之前需要安装pe ...
mysql之行(记录)的详细操作
在Mysql管理软件中, 可以通过sql语句中的dml语言来实现数据的操作, 包括使用INSERT实现数据的插入 UPDATE实现数据的更新使用DELETE实现数据的删除使用SELECT查询数据 ...
MySql之行记录的详细操作,创建用户以及库表的授权
一介绍 MySQL数据操作: DML ======================================================== 在MySQL管理软件中,可以通过SQL语句中的 ...
mysql基于binlog回滚工具_flashback（python版本）
update.delete的条件写错甚至没有写,导致数据操作错误,需要恢复被误操作的行记录.这种情形,其实时有发生,可以选择用备份文件+binlog来恢复到测试环境,然后再做数据修复,但是这样 ...
百万年薪python之路 -- MySQL数据库之 MySQL行(记录)的操作(一)
MySQL的行(记录)的操作(一) 1. 增(insert) insert into 表名 value((字段1,字段2...); # 只能增加一行记录 insert into 表名 values(字 ...
MySQL数据库备份还原（基于binlog的增量备份）
MySQL数据库备份还原(基于binlog的增量备份) 一.简介 1.增量备份增量备份是指在一次全备份或上一次增量备份后,以后每次的备份只需备份与前一次相比增加或者被修改的文件.这就意味 ...
MySQL基于binlog主从复制
MySQL复制介绍默认情况下复制是异步进行的,从库也不需要一直连接到主库来同步数据 MySQL复制的数据粒度可以是主实例上所有的数据库,也可以是指定的一个或多个数据库 ,也可以是一个数据库里的指定 ...
（4.11）mysql备份还原——mysql闪回技术（基于binlog）
0.闪回技术与工具简介 mysql闪回工具比较流行三大类: [0.1]官方的mysqlbinlog:支持数据库在线/离线,用脚本处理binlog的输出,转化成对应SQL再执行.通用性不好,对正则.se ...

随机推荐

反转链表(python3)
问题描述: 反转一个单链表. 示例: 输入: 1->2->3->4->5->NULL 输出: 5->4->3->2->1->NULL解法1: ...
Invocation of init method failed; nested exception is java.text.ParseException: '?' can only be specfied for Day-of-Month or Day-of-Week.
org.springframework.beans.factory.BeanCreationException: Error creating bean with name 'cronTrigger' ...
http状态码301和302的区别
1.官方的比较简洁的说明: 301 redirect: 301 代表永久性转移(Permanently Moved) 302 redirect: 302 代表暂时性转移(Temporarily Mov ...
在Microsoft Power BI中创建地图的10种方法
今天,我们来简单聊一聊“地图”. 在我们日常生活中,地图地位已经提升的越来越高,出门聚餐.驾驶.坐车.旅行......应运而生的就是各种Map APP. 作为数据分析师,我们今天不讲生活地图,要跟大家 ...
java.lang.String数据转换为java.sql.Date
在JavaWeb编程中,往往涉及数据库,javaBean,前端数据数据类型不一致的问题数据库和javaBean之间可以直接选择相对应的数据类型,而serverlet从前端获取的数据往往是String ...
Java 面试题 —— java 源码
1. 静态工厂方法静态工厂方法不必在每次调用它们的时候都创建一个新的对象: Boolean.valueOf(boolean): public final class Boolean { public ...
WEBBASE篇：第三篇， CSS知识1
第三篇, CSS知识1 一,CSS 介绍 CSS: Cascading Style Sheets ---样式表 HTML: 搭建网页结构: CSS: 在网页结构基础上进行网页的美化: 二,CSS的使用 ...
利用selenium模拟登录webqq
from selenium import webdriver import selenium.webdriver.support.ui as ui import time opt = webdrive ...
fixed不能罩住下面的内容
fix的优先级并不是最高的,所以要设置z-index,比它下面的元素高就能遮住了
2018.4.23 git常用操作命令收集(转)
Git常用操作命令收集: 1. 远程仓库相关命令检出仓库:$ git clone git://github.com/jquery/jquery.git 查看远程仓库:$ git remote -v ...

基于binlog来分析mysql的行记录修改情况

基于binlog来分析mysql的行记录修改情况的更多相关文章

随机推荐

热门专题