python 读取mysql存储的文件路径下载文件,内容解析,上传七牛云,内容入es
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Read file URLs stored in MySQL, download each file, parse its content,
upload the file to Qiniu cloud storage, and emit the parsed content as
ES bulk-import JSON files.

Python 2 script: relies on ConfigParser/HTMLParser module names and
``sys.setdefaultencoding``.
"""
# --- standard library ---
import ConfigParser
import json
import os
import random
import re
import sys
import time
from re import sub

# --- third party ---
import requests
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
from qiniu import Auth
from qiniu import etag
from qiniu import put_file

# --- local ---
import log_config
from OP_Mysql import get_connection
from HTMLParser import HTMLParser

# Python 2 only: make utf-8 the default codec for implicit str conversions.
reload(sys)
sys.setdefaultencoding('utf-8')
# Module-level logger and configuration; mysql.conf must be in the CWD.
logger = log_config.getlogger('analysis_neeq_content', 'analysis_neeq_content.log')
conf = ConfigParser.ConfigParser()
conf.read("mysql.conf")
# Sharding parameters: this instance only processes rows where
# neeq_id % neeq_server_num == neeq_remainder and neeq_id <= neeq_start_id
# (see the SQL in get_data).
neeq_remainder = conf.get("basic_config", "neeq_remainder")
neeq_server_num = conf.get("basic_config", "neeq_server_num")
neeq_start_id = conf.get("basic_config", "neeq_start_id")
# Directory and base name for the ES bulk JSON output files.
neeq_json_path = conf.get("basic_config", "neeq_json_path")
neeq_json = conf.get("basic_config", "neeq_json")
json_suffix = '.json'
# Path of the "es_id,file_addr;" index file appended by write_es_id_file_addr.
neeq_id = conf.get("basic_config", "neeq_id")
# Local directory downloaded files are written to before upload.
neeq_file_path = conf.get("basic_config", "neeq_file_path")
# Qiniu cloud storage credentials.
access_key = conf.get("basic_config", "access_key")
secret_key = conf.get("basic_config", "secret_key")
bucket = conf.get("basic_config", "bucket")


class analysis:
    """Pipeline worker: page rows out of MySQL, download each announcement
    file, parse its text, upload the file to Qiniu and append the parsed
    document to the ES bulk JSON output."""

    def __init__(self):
        # Counter used to rotate the output JSON file name once it grows
        # past the line limit (see write_json_file).
        self.count = 0
        self.neeq_json = neeq_json
        # www.neeq.com.cn rejects requests without browser-like headers.
        self.headers = {'Host': 'www.neeq.com.cn',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'
                        }
        self.create_init_dirtory()  # make sure output directories exist

    def create_init_dirtory(self):
        # Create the JSON output and download directories if missing.
        if not os.path.exists(neeq_json_path):
            os.makedirs(neeq_json_path)
        if not os.path.exists(neeq_file_path):
            os.makedirs(neeq_file_path)
# Process all pending rows for this shard: page through MySQL 1000 rows at a
# time, download and parse each file, upload it to Qiniu, then append the
# documents to the ES JSON file and the es_id/file-address index file.
def get_data(self):
    with get_connection() as db:
        # Count matching rows for this shard (neeq_id % server_num == remainder).
        count = r"SELECT COUNT(*) as num FROM ssb_insight_neeq WHERE pro_status = 0 AND neeq_id <= %s and %s = (neeq_id %% %s)"
        logger.info("now excute sql script sql = %s" % count)
        try:
            db.cursor.execute(count, [neeq_start_id, neeq_remainder, neeq_server_num])
            counts = db.cursor.fetchall()
            num = counts[0]['num']
            logger.info("now rows num = %s" % num)
            # Ceiling division into 1000-row pages ("//" keeps integer
            # semantics identical on Python 2 and 3).
            if 0 != num % 1000:
                pages = num // 1000 + 1
            else:
                pages = num // 1000
            start_rows = 1000
            for i in range(0, pages):
                start_page = i * 1000
                sql = "SELECT t.sec_code,t.sec_name,t.title,t.doc_type,t.doc_type_key,c.industry1,c.industry2," \
                      "t.url,t.public_time,t.content,t.pro_status,t.module,t.es_id FROM ssb_insight_neeq t " \
                      "LEFT JOIN ssb_d_listed_company c ON t.sec_code = c.secCode WHERE t.pro_status = 0 and t.neeq_id <= %s " \
                      "AND %s = (t.neeq_id %% %s) ORDER BY t.neeq_id DESC LIMIT %s ,%s"
                db.cursor.execute(sql, [neeq_start_id, neeq_remainder, neeq_server_num, start_page, start_rows])
                result_datas = db.cursor.fetchall()
                json_data = []        # documents accumulated for this page
                es_id_file_addr = []  # es_id -> qiniu key index for this page
                for row in result_datas:
                    item = {}
                    es_obj = {}
                    result = {'secCode': row['sec_code'],
                              'secName': row['sec_name'],
                              'title': row['title'],
                              'docType': row['doc_type'].split(','),
                              'docTypeKey': row['doc_type_key'].split(','),
                              'url': row['url'],
                              'publicTime': row['public_time'],
                              'industry1': row['industry1'],
                              'industry2': row['industry2'],
                              'content': row['content'],
                              'proStatus': bool(row['pro_status']),
                              'module': row['module'],
                              }
                    file_url = row['url']
                    self.download_file(file_url)
                    file_name = re.findall(r".*/(.*)", file_url)[0]
                    file_paths = neeq_file_path + file_name
                    if os.path.exists(file_paths):
                        content = self.analysis_file_content(file_paths)
                        # Upload keyed by the local path, then delete locally.
                        self.upload_qiniu(file_paths)
                        self.del_file(file_paths)
                        if content == '':
                            # Nothing extracted; skip this document.
                            continue
                        result['content'] = content
                    else:
                        logger.warn("file_url %s download fail" % file_url)
                        continue
                    item['id'] = row['es_id']
                    item['data'] = result
                    json_data.append(item)
                    es_obj['es_id'] = row['es_id']
                    es_obj['file_addr'] = file_paths
                    es_id_file_addr.append(es_obj)
                self.write_json_file(json_data)
                self.write_es_id_file_addr(es_id_file_addr)
        except Exception as e:
            logger.error("Error: unable to fecth data Exception %s" % e)


# Append documents to the current ES JSON file, rotating to a numbered file
# once the current one exceeds 100000 lines.
def write_json_file(self, json_data):
    json_path = neeq_json_path + self.neeq_json + json_suffix
    rows = self.get_json_rows(json_path)
    if rows > 100000:
        self.count = self.count + 1
        self.neeq_json = neeq_json + str(self.count)
        json_path = neeq_json_path + self.neeq_json + json_suffix
    with open(json_path, 'a') as es_file:
        for jsonitem in json_data:
            jsondatar = json.dumps(jsonitem, ensure_ascii=True)
            es_file.write(jsondatar + "\n")


# Append "es_id,file_addr;" lines mapping ES document ids to Qiniu keys.
def write_es_id_file_addr(self, es_id_data):
    with open(neeq_id, 'a') as es_id_file:
        for jsonitem in es_id_data:
            es_id_file.write(jsonitem['es_id'] + "," + jsonitem['file_addr'] + ";" + "\n")
# Count the lines of json_path (0 if it does not exist); used to decide when
# write_json_file must rotate to a new output file.
def get_json_rows(self, json_path):
    if not os.path.exists(json_path):
        return 0
    count = 0
    # "with" guarantees the handle is closed even if read() raises
    # (the original leaked the handle on error).
    with open(json_path, 'rb') as thefile:
        while True:
            chunk = thefile.read(8192 * 1024)
            if not chunk:
                break
            # b'\n' matches the binary read on both Python 2 and 3.
            count += chunk.count(b'\n')
    return count
# Upload the local file to the Qiniu bucket, using its local path as the
# object key (the same key is recorded by write_es_id_file_addr).
def upload_qiniu(self, file_path_name):
    q = Auth(access_key, secret_key)
    # Upload token valid for one hour.
    token = q.upload_token(bucket, file_path_name, 3600)
    ret, info = put_file(token, file_path_name, file_path_name)
    if info.status_code != 200:
        # A failed upload is an error, not routine progress info.
        logger.error("file upload qiniuyun fail %s" % file_path_name)
# Remove a downloaded file once it has been parsed and uploaded.
# NOTE(review): exists/remove is racy (TOCTOU) if anything else touches the
# download directory — appears single-process here, so left as-is.
def del_file(self, file_path_name):
    if not os.path.exists(file_path_name):
        logger.info("%s 文件不存在" % file_path_name)
        return
    os.remove(file_path_name)
# Download file_url into neeq_file_path, retrying up to 3 times.
# BUG FIX: the original wrapped the whole while-loop in one try, so the first
# exception aborted every remaining retry, and a persistent non-200 response
# never incremented `retry` and looped forever. Now each attempt has its own
# try, and retry advances on every attempt.
def download_file(self, file_url):
    # Small random delay so we do not hammer the server.
    time.sleep(random.uniform(1, 2))
    file_name = re.findall(r".*/(.*)", file_url)[0]
    retry = 0
    while retry < 3:
        try:
            response = requests.get(file_url, stream=True, headers=self.headers, timeout=5)
            if response.status_code == requests.codes.ok:
                with open(neeq_file_path + file_name, "wb") as code:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            code.write(chunk)
                return
        except Exception as e:
            logger.exception(e)
        retry += 1
# Dispatch on the file extension to the matching parser.
# Returns '' for extensions we cannot handle (caller skips those rows).
def analysis_file_content(self, filename):
    content = ''
    suffix = re.findall(r'.*(\..*)', str(filename))[0]
    # lower() generalizes the original '.pdf'/'.PDF' pair to any casing.
    if suffix.lower() == '.pdf':
        content = self.analysis_pdf_file_content(filename)
    elif suffix.lower() == '.html':
        content = self.analysi_html_file_content(filename)
    return content


# Extract all horizontal text boxes from a PDF with pdfminer.
# Returns '' (and logs) on any parse failure.
def analysis_pdf_file_content(self, filename):
    content = ''
    try:
        # "with" closes the handle even when pdfminer raises
        # (the original only closed it on the success path).
        with open(filename, 'rb') as fileobject:
            parser = PDFParser(fileobject)
            document = PDFDocument(parser)
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text().encode('utf-8')
                        content += results
    except Exception as e:
        logger.error("analysis pdf file fail : %s" % e)
    return content


# Strip tags from an HTML file and return its plain text.
# BUG FIX: the original printed the raw bytes, never closed the handle, and
# never returned the parsed text (implicitly returning None, which broke the
# `content == ''` check in get_data).
def analysi_html_file_content(self, filename):
    with open(filename, 'rb') as content_open:
        contents = content_open.read()
    return dehtml(contents)


class pythonNToTxt(HTMLParser):
    """HTML-to-text converter: keeps text content, mapping <p> to blank
    lines and <br> to newlines, collapsing runs of whitespace."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.__text = []

    def handle_data(self, data):
        text = data.strip()
        if len(text) > 0:
            text = sub('[ \t\r\n]+', ' ', text)
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        if tag == 'br':
            self.__text.append('\n\n')

    def text(self):
        return ''.join(self.__text).strip()


# Convert HTML markup to plain text; on parser failure log and fall back to
# returning the input unchanged.
def dehtml(text):
    try:
        parser = pythonNToTxt()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception as e:
        logger.error("html analysis excepiton : %s" % e)
        return text


# Script entry point, guarded so the module can be imported without running.
if __name__ == '__main__':
    logger.info("analysis neeq content start,now params neeq_remainder=%s,neeq_start_id =%s,neeq_json = %s,neeq_id = %s ,neeq_file_path = %s" % (neeq_remainder, neeq_start_id, neeq_json, neeq_id, neeq_file_path))
    worker = analysis()
    worker.get_data()
#!/usr/bin/env python
# -*- coding: utf-8 -*
"""OP_Mysql.py -- pooled MySQL access helper (PyMySQL + DBUtils).

Usage:
    with get_connection() as db:
        db.cursor.execute(sql, params)
"""
import sys

import ConfigParser
import pymysql
from DBUtils.PooledDB import PooledDB

import log_config

# Python 2 only: make utf-8 the default codec for implicit str conversions.
reload(sys)
sys.setdefaultencoding('utf-8')

conf = ConfigParser.ConfigParser()
conf.read("mysql.conf")
user = conf.get("mysql", "user")
password = conf.get("mysql", "password")
database = conf.get("mysql", "database")
host = conf.get("mysql", "host")
port = conf.get("mysql", "port")
charset = "utf8"


class OPMysql(object):
    """Context manager that hands out a pooled connection plus dict cursor."""

    # Shared connection pool, created lazily on first use.
    __pool = None

    def __init__(self):
        # Connection and cursor are only opened inside __enter__.
        pass

    def __enter__(self):
        # Borrow a pooled connection and open a DictCursor on it.
        self.conn = self.getmysqlconn()
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        return self

    def __exit__(self, typeq, value, trace):
        # Close the cursor and return the connection to the pool.
        self.cursor.close()
        self.conn.close()

    @staticmethod
    def getmysqlconn():
        # Lazily build the pool on first call, then reuse it.
        if OPMysql.__pool is None:
            OPMysql.__pool = PooledDB(creator=pymysql, mincached=1, maxcached=10, host=host, user=user, passwd=password, db=database, port=int(port), charset=charset)
        return OPMysql.__pool.connection()


def get_connection():
    # Factory used by callers: `with get_connection() as db: ...`
    return OPMysql()
日志模块 log_config 的实现见前面的随笔。
#------mysql basic config
[mysql]
user=用户名
password=密码
database=数据库
host=你的mysqlIp
port =3306

[basic_config]
#---------------neeq config
# remainder this server is responsible for (余数为0)
neeq_remainder = 0
# total number of servers sharing the work (服务器台数)
neeq_server_num = 6
neeq_start_id = 1000
neeq_json_path = neeq/json/
neeq_json = neeq
neeq_id = neeq/neeq_id.txt
neeq_file_path = neeq/file/
bucket = 七牛云bucket
access_key =你的七牛云access_key
secret_key = 你的七牛云secret_key
python 读取mysql存储的文件路径下载文件,内容解析,上传七牛云,内容入es的更多相关文章
- JAVA中实现根据文件路径下载文件
import javax.servlet.http.HttpServletResponse; import java.io.File; import java.io.FileInputStream; ...
- 由ASP.NET Core根据路径下载文件异常引发的探究
前言 最近在开发新的项目,使用的是ASP.NET Core6.0版本的框架.由于项目中存在文件下载功能,没有使用类似MinIO或OSS之类的分布式文件系统,而是下载本地文件,也就是根据本地文件路径进行 ...
- Delphi阿里云对象存储OSS【支持上传文件、下载文件、删除文件、创建目录、删除目录、Bucket操作等】
作者QQ:(648437169) 点击下载➨Delphi阿里云对象存储OSS 阿里云api文档 [Delphi阿里云对象存储OSS]支持 获取Bucket列表.设置Bucket ...
- js上传文件带参数,并且,返回给前台文件路径,解析上传的xml文件,存储到数据库中
ajaxfileupload.js jQuery.extend({ createUploadIframe: function(id, uri) { //create frame var frameId ...
- 【python】用python脚本Paramiko实现远程执行命令、下载、推送/上传文件功能
Paramiko: paramiko模块,基于SSH用于连接远程服务器并执行相关操作. SSHClient: 用于连接远程服务器并执行基本命令 SFTPClient: 用于连接远程服务器并执行上传下载 ...
- 文件上传和下载(可批量上传)——Spring(二)
针对SpringMVC的文件上传和下载.下载用之前“文件上传和下载——基础(一)”的依然可以,但是上传功能要修改,这是因为springMVC 都为我们封装好成自己的文件对象了,转换的过程就在我们所配置 ...
- 基于SpringMVC的文件(增删改查)上传、下载、更新、删除
一.项目背景 摘要:最近一直在忙着项目的事,3个项目过去了,发现有一个共同的业务,那就是附件的处理,附件包括各种文档,当然还有图片等特殊文件,由于时间的关系,每次都是匆匆忙忙的搞定上线,称这项目的空档 ...
- 微信小程序开发技巧总结(二) -- 文件的选取、移动、上传和下载
微信小程序开发技巧总结(二) -- 文件的选取.移动.上传和下载 1.不同类型文件的选取 1.1 常用的图片 视频 对于大部分开发者来说,需要上传的文件形式主要为图片,微信为此提供了接口. wx.ch ...
- 文件上传和下载(可批量上传)——Spring(三)
在文件上传和下载(可批量上传)——Spring(二)的基础上,发现了文件下载时,只有在Chrome浏览器下文件名正常显示,还有发布到服务器后,不能上传到指定的文件夹目录,如上传20160310.txt ...
随机推荐
- UnQLite简介
UnQLite是,由Symisc Systems公司出品的一个嵌入式C语言软件库,它实现了一个自包含.无服务器.零配置.事务化的NoSQL数据库引擎.UnQLite是一个文档存储数据库,类似于Mong ...
- PHP实现日志写入log.txt
引言:有时候调试,看不到效果,需要通过写入文件来实现. 案例: <?php $myfile = fopen("log.txt", "a+") or die ...
- postgresql 9.5 pgpool 主从复制 以及错误解决
PostgreSQL+pgpool-II复制方案 这里不做功能的描述,只写搭建的过程和遇到的一些问题 1 系统 [root@mysqlhq ~]# cat /etc/redhat-release Ky ...
- PHP开发环境正确的错误信息处理
正确记录配置 php.ini display_errors = On error_reporting = E_ALL log_errors = On error_log = F:/data/php/e ...
- 预编译头文件来自编译器的早期版本,或者预编译头为 C++ 而在 C 中使用它(或相反)转
vs2010的mfc项目中编译c语言出现错误: "...预编译头文件来自编译器的早期版本,或者预编译头为 C++ 而在 C 中使用它(或相反)" 解决方法: 建工程时 建立空项目 ...
- spring注解注入属性
- IP地址分类和子网划分
IP地址: 地址范围 网络地址规律 子网掩码 私有地址 保留地址 A类地址:从1.0.0.0 到1 ...
- Python小代码_15_遍历指定路径下的所有文件和文件夹,并格式化输出文件路径文件名和文件夹名,文件大小,修改时间
遍历指定路径下的所有文件和文件夹,并格式化输出文件路径文件名和文件夹名,文件大小,修改时间 import osimport datetime def print_tree(dir_path): for ...
- imp导入数据的时候报错:ORA-01658: 无法为表空间 MAXDATA 中的段创建 INITIAL 区
在oracle里创建表,报出错: ORA-01658: 无法为表空间space中的段创建 INITIAL 区: 或者: ORA-01658: unable to create INITIAL exte ...
- solr facet查询及solrj 读取facet数据(相当有用)
原文出自:http://www.coin163.com/java/docs/201310/d_3010029802.html 一. Facet 简介 Facet 是 solr 的高级搜索功能之一 ...