#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download NEEQ disclosure files listed in MySQL, extract their text
content (PDF/HTML), upload the original files to Qiniu cloud storage and
append the parsed records to rolling JSON files for Elasticsearch import.

NOTE: this is Python 2 code (ConfigParser/HTMLParser modules, reload(sys)).
"""
import ConfigParser
import json
import os
import random
import re
import sys
import time
from re import sub

import requests
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
from qiniu import Auth
from qiniu import etag
from qiniu import put_file
from HTMLParser import HTMLParser

import log_config
from OP_Mysql import get_connection

# Python 2 only: make utf-8 the default codec for implicit str/unicode
# conversions (needed because row contents are Chinese text).
reload(sys)
sys.setdefaultencoding('utf-8')

logger = log_config.getlogger('analysis_neeq_content', 'analysis_neeq_content.log')

conf = ConfigParser.ConfigParser()
conf.read("mysql.conf")
# Sharding parameters: this worker only processes rows where
# neeq_id % neeq_server_num == neeq_remainder and neeq_id <= neeq_start_id.
neeq_remainder = conf.get("basic_config", "neeq_remainder")
neeq_server_num = conf.get("basic_config", "neeq_server_num")
neeq_start_id = conf.get("basic_config", "neeq_start_id")
# Output locations for the generated JSON files and the id->file mapping.
neeq_json_path = conf.get("basic_config", "neeq_json_path")
neeq_json = conf.get("basic_config", "neeq_json")
json_suffix = '.json'
neeq_id = conf.get("basic_config", "neeq_id")
neeq_file_path = conf.get("basic_config", "neeq_file_path")
# Qiniu cloud storage credentials.
access_key = conf.get("basic_config", "access_key")
secret_key = conf.get("basic_config", "secret_key")
bucket = conf.get("basic_config", "bucket")


class analysis:
    """Pull unprocessed NEEQ announcements from MySQL, download each
    attachment, extract its text, upload the original file to Qiniu and
    append the parsed record to rolling JSON bulk files for ES."""

    # Rows fetched per database page / records per JSON flush.
    PAGE_SIZE = 1000

    def __init__(self):
        # self.count numbers the rolling JSON output files.
        self.count = 0
        self.neeq_json = neeq_json
        self.headers = {
            'Host': 'www.neeq.com.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'
        }
        self.create_init_dirtory()  # make sure output directories exist

    def create_init_dirtory(self):
        """Create the JSON output and download directories if missing."""
        if not os.path.exists(neeq_json_path):
            os.makedirs(neeq_json_path)
        if not os.path.exists(neeq_file_path):
            os.makedirs(neeq_file_path)

    def get_data(self):
        """Page through this shard's pending rows (PAGE_SIZE at a time),
        download/parse/upload each attachment and flush JSON per page."""
        with get_connection() as db:
            count_sql = (r"SELECT COUNT(*) as num FROM ssb_insight_neeq "
                         r"WHERE pro_status = 0 AND neeq_id <= %s and %s = (neeq_id %% %s)")
            logger.info("now execute sql script sql = %s" % count_sql)
            try:
                db.cursor.execute(count_sql, [neeq_start_id, neeq_remainder, neeq_server_num])
                counts = db.cursor.fetchall()
                num = counts[0]['num']
                logger.info("now rows num = %s" % num)
                # Python 2 integer division: round up to cover the partial page.
                if 0 != num % self.PAGE_SIZE:
                    pages = num / self.PAGE_SIZE + 1
                else:
                    pages = num / self.PAGE_SIZE
                for i in range(0, pages):
                    start_page = i * self.PAGE_SIZE
                    sql = ("SELECT t.sec_code,t.sec_name,t.title,t.doc_type,t.doc_type_key,"
                           "c.industry1,c.industry2,t.url,t.public_time,t.content,"
                           "t.pro_status,t.module,t.es_id FROM ssb_insight_neeq t "
                           "LEFT JOIN ssb_d_listed_company c ON t.sec_code = c.secCode "
                           "WHERE t.pro_status = 0 and t.neeq_id <= %s "
                           "AND %s = (t.neeq_id %% %s) ORDER BY t.neeq_id DESC LIMIT %s ,%s")
                    db.cursor.execute(sql, [neeq_start_id, neeq_remainder,
                                            neeq_server_num, start_page, self.PAGE_SIZE])
                    result_datas = db.cursor.fetchall()
                    json_data = []        # parsed records for this page
                    es_id_file_addr = []  # es_id -> uploaded file key pairs
                    for row in result_datas:
                        item = {}
                        es_obj = {}
                        result = {'secCode': row['sec_code'],
                                  'secName': row['sec_name'],
                                  'title': row['title'],
                                  'docType': row['doc_type'].split(','),
                                  'docTypeKey': row['doc_type_key'].split(','),
                                  'url': row['url'],
                                  'publicTime': row['public_time'],
                                  'industry1': row['industry1'],
                                  'industry2': row['industry2'],
                                  'content': row['content'],
                                  'proStatus': bool(row['pro_status']),
                                  'module': row['module'],
                                  }
                        file_url = row['url']
                        self.download_file(file_url)
                        file_name = re.findall(r".*/(.*)", file_url)[0]
                        file_paths = neeq_file_path + file_name
                        if os.path.exists(file_paths):
                            content = self.analysis_file_content(file_paths)
                            self.upload_qiniu(file_paths)
                            self.del_file(file_paths)
                            # Skip records whose attachment yielded no text.
                            if content == '':
                                continue
                            result['content'] = content
                        else:
                            logger.warn("file_url %s download fail" % file_url)
                            continue
                        item['id'] = row['es_id']
                        item['data'] = result
                        json_data.append(item)
                        es_obj['es_id'] = row['es_id']
                        es_obj['file_addr'] = file_paths
                        es_id_file_addr.append(es_obj)
                    self.write_json_file(json_data)
                    self.write_es_id_file_addr(es_id_file_addr)
            except Exception as e:
                logger.error("Error: unable to fetch data Exception %s" % e)

    def write_json_file(self, json_data):
        """Append records (one JSON object per line) to the current output
        file, rolling over to a new file once it exceeds 100000 lines."""
        json_path = neeq_json_path + self.neeq_json + json_suffix
        rows = self.get_json_rows(json_path)
        if rows > 100000:
            self.count = self.count + 1
            self.neeq_json = neeq_json + str(self.count)
            json_path = neeq_json_path + self.neeq_json + json_suffix
        with open(json_path, 'a') as es_file:
            for jsonitem in json_data:
                es_file.write(json.dumps(jsonitem, ensure_ascii=True) + "\n")

    def write_es_id_file_addr(self, es_id_data):
        """Append 'es_id,file_addr;' lines mapping ES ids to Qiniu keys."""
        with open(neeq_id, 'a') as es_id_file:
            for jsonitem in es_id_data:
                es_id_file.write(jsonitem['es_id'] + "," + jsonitem['file_addr'] + ";" + "\n")

    def get_json_rows(self, json_path):
        """Count newlines in json_path (0 if it does not exist), reading in
        8 MB chunks so large files do not blow up memory."""
        if not os.path.exists(json_path):
            return 0
        count = 0
        thefile = open(json_path, 'rb')
        try:  # fix: close the handle even if read() raises
            while True:
                chunk = thefile.read(8192 * 1024)
                if not chunk:
                    break
                count += chunk.count('\n')
        finally:
            thefile.close()
        return count

    def upload_qiniu(self, file_path_name):
        """Upload the local file to Qiniu, keyed by its local path."""
        q = Auth(access_key, secret_key)
        # Upload token valid for one hour.
        token = q.upload_token(bucket, file_path_name, 3600)
        ret, info = put_file(token, file_path_name, file_path_name)
        if info.status_code != 200:
            logger.info("file upload qiniuyun fail %s" % file_path_name)

    def del_file(self, file_path_name):
        """Delete the local copy after upload; log if it is already gone."""
        if os.path.exists(file_path_name):
            os.remove(file_path_name)
        else:
            logger.info("%s 文件不存在" % file_path_name)

    def download_file(self, file_url):
        """Download file_url into neeq_file_path, retrying up to 3 times.

        Fixes two bugs in the original: the try wrapped the whole while
        loop, so the first exception aborted all retries; and a non-200
        response never incremented retry, looping forever.
        """
        time.sleep(random.uniform(1, 2))  # throttle requests to the server
        file_name = re.findall(r".*/(.*)", file_url)[0]
        retry = 0
        while retry < 3:
            try:
                response = requests.get(file_url, stream=True,
                                        headers=self.headers, timeout=5)
                if response.status_code == requests.codes.ok:
                    with open(neeq_file_path + file_name, "wb") as code:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:
                                code.write(chunk)
                    break
                retry += 1
            except Exception as e:
                logger.exception(e)
                retry += 1

    def analysis_file_content(self, filename):
        """Dispatch on file extension; return extracted text ('' if the
        type is unsupported or parsing failed)."""
        content = ''
        # os.path.splitext is more robust than the original regex and
        # .lower() covers .pdf/.PDF/.html/.HTML in one comparison each.
        extension = os.path.splitext(str(filename))[1].lower()
        if extension == '.pdf':
            content = self.analysis_pdf_file_content(filename)
        elif extension == '.html':
            content = self.analysis_html_file_content(filename)
        return content

    def analysis_pdf_file_content(self, filename):
        """Extract the text of every horizontal text box in a PDF.

        Returns '' when the PDF forbids extraction or parsing fails.
        """
        content = ''
        try:
            fileobject = open(filename, 'rb')
            try:  # fix: close the handle even when pdfminer raises
                parser = PDFParser(fileobject)
                document = PDFDocument(parser)
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    for x in layout:
                        if isinstance(x, LTTextBoxHorizontal):
                            content += x.get_text().encode('utf-8')
            finally:
                fileobject.close()
        except Exception as e:
            logger.error("analysis pdf file fail : %s" % e)
        return content

    def analysis_html_file_content(self, filename):
        """Read an HTML file and return its visible text.

        Fixes the original, which never returned the parsed content
        (callers always got None), leaked the file handle and left a
        debug print in place.
        """
        content_open = open(filename, 'rb')
        try:
            contents = content_open.read()
        finally:
            content_open.close()
        return dehtml(contents)


class pythonNToTxt(HTMLParser):
    """Minimal HTML-to-plain-text converter: keeps text nodes, maps <p>
    to blank lines and <br>/<br/> to newlines."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.__text = []

    def handle_data(self, data):
        # Collapse internal whitespace runs to single spaces.
        text = data.strip()
        if len(text) > 0:
            text = sub('[ \t\r\n]+', ' ', text)
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        if tag == 'br':
            self.__text.append('\n\n')

    def text(self):
        return ''.join(self.__text).strip()


def dehtml(text):
    """Strip HTML tags from text; on any parse error, return the raw input."""
    try:
        parser = pythonNToTxt()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception as e:
        logger.error("html analysis exception : %s" % e)
        return text


logger.info("analysis neeq content start,now params neeq_remainder=%s,"
            "neeq_start_id =%s,neeq_json = %s,neeq_id = %s ,neeq_file_path = %s"
            % (neeq_remainder, neeq_start_id, neeq_json, neeq_id, neeq_file_path))
analysis = analysis()
analysis.get_data()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""OP_Mysql: pooled MySQL access wrapped in a context manager.

Usage:
    with get_connection() as db:
        db.cursor.execute(sql, params)

NOTE: this is Python 2 code (ConfigParser module, reload(sys)).
"""
import ConfigParser
import sys

import pymysql
from DBUtils.PooledDB import PooledDB

import log_config

# Python 2 only: make utf-8 the default codec for implicit conversions.
reload(sys)
sys.setdefaultencoding('utf-8')

conf = ConfigParser.ConfigParser()
conf.read("mysql.conf")
user = conf.get("mysql", "user")
password = conf.get("mysql", "password")
database = conf.get("mysql", "database")
host = conf.get("mysql", "host")
port = conf.get("mysql", "port")
charset = "utf8"


class OPMysql(object):
    """Context manager yielding a pooled connection plus a dict cursor."""

    # Process-wide connection pool, created lazily on first use.
    __pool = None

    def __init__(self):
        # Connection and cursor are created in __enter__, not here.
        pass

    def __enter__(self):
        self.conn = self.getmysqlconn()
        # DictCursor: rows come back as dicts keyed by column name.
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        return self

    def __exit__(self, typeq, value, trace):
        # Always release the cursor and return the connection to the pool.
        self.cursor.close()
        self.conn.close()

    @staticmethod
    def getmysqlconn():
        """Return a connection from the shared pool, building it on demand."""
        if OPMysql.__pool is None:
            OPMysql.__pool = PooledDB(creator=pymysql, mincached=1, maxcached=10,
                                      host=host, user=user, passwd=password,
                                      db=database, port=int(port), charset=charset)
        return OPMysql.__pool.connection()


def get_connection():
    """Factory used by callers: `with get_connection() as db: ...`."""
    return OPMysql()

日志模块在前面随笔中

#------mysql basic config
[mysql]
user=用户名
password=密码
database=数据库
host=你的mysqlIp
port =3306

[basic_config]
#---------------neeq config
#余数为0
neeq_remainder = 0
#服务器台数
neeq_server_num = 6
neeq_start_id = 1000
neeq_json_path = neeq/json/
neeq_json = neeq
neeq_id = neeq/neeq_id.txt
neeq_file_path = neeq/file/
bucket = 七牛云bucket
access_key =你的七牛云access_key
secret_key = 你的七牛云secret_key

python 读取mysql存储的文件路径下载文件,内容解析,上传七牛云,内容入es的更多相关文章

  1. JAVA中实现根据文件路径下载文件

    import javax.servlet.http.HttpServletResponse; import java.io.File; import java.io.FileInputStream; ...

  2. 由ASP.NET Core根据路径下载文件异常引发的探究

    前言 最近在开发新的项目,使用的是ASP.NET Core6.0版本的框架.由于项目中存在文件下载功能,没有使用类似MinIO或OSS之类的分布式文件系统,而是下载本地文件,也就是根据本地文件路径进行 ...

  3. Delphi阿里云对象存储OSS【支持上传文件、下载文件、删除文件、创建目录、删除目录、Bucket操作等】

    作者QQ:(648437169) 点击下载➨Delphi阿里云对象存储OSS             阿里云api文档 [Delphi阿里云对象存储OSS]支持 获取Bucket列表.设置Bucket ...

  4. js上传文件带参数,并且,返回给前台文件路径,解析上传的xml文件,存储到数据库中

    ajaxfileupload.js jQuery.extend({ createUploadIframe: function(id, uri) { //create frame var frameId ...

  5. 【python】用python脚本Paramiko实现远程执行命令、下载、推送/上传文件功能

    Paramiko: paramiko模块,基于SSH用于连接远程服务器并执行相关操作. SSHClient: 用于连接远程服务器并执行基本命令 SFTPClient: 用于连接远程服务器并执行上传下载 ...

  6. 文件上传和下载(可批量上传)——Spring(二)

    针对SpringMVC的文件上传和下载.下载用之前“文件上传和下载——基础(一)”的依然可以,但是上传功能要修改,这是因为springMVC 都为我们封装好成自己的文件对象了,转换的过程就在我们所配置 ...

  7. 基于SpringMVC的文件(增删改查)上传、下载、更新、删除

    一.项目背景 摘要:最近一直在忙着项目的事,3个项目过去了,发现有一个共同的业务,那就是附件的处理,附件包括各种文档,当然还有图片等特殊文件,由于时间的关系,每次都是匆匆忙忙的搞定上线,称这项目的空档 ...

  8. 微信小程序开发技巧总结(二) -- 文件的选取、移动、上传和下载

    微信小程序开发技巧总结(二) -- 文件的选取.移动.上传和下载 1.不同类型文件的选取 1.1 常用的图片 视频 对于大部分开发者来说,需要上传的文件形式主要为图片,微信为此提供了接口. wx.ch ...

  9. 文件上传和下载(可批量上传)——Spring(三)

    在文件上传和下载(可批量上传)——Spring(二)的基础上,发现了文件下载时,只有在Chrome浏览器下文件名正常显示,还有发布到服务器后,不能上传到指定的文件夹目录,如上传20160310.txt ...

随机推荐

  1. phpcms文档

    http://www.phpcms.cn/doc/PHPCMSDocumentor/cache_module.html http://www.cnblogs.com/Braveliu/p/507493 ...

  2. YUV

    https://msdn.microsoft.com/en-us/library/aa904813(VS.80).aspx

  3. C# byte数组转成Bitmap对象

    方法一: /// <summary> /// 将数组转换成彩色图片 /// </summary> /// <param name="rawValues" ...

  4. npm笔记和bower

    生成package.json文件的方式就是dos下进入该文件夹,然后执行 npm init Bower简单点儿说就是通过nodejs直接下载GitHub上的js源码 首先你得有node,这里就不多做介 ...

  5. 小程序mina框架与配置

    小程序是采用MINA框架 <!--demo.wxml--> <view> Hello {{name}}</view> <button bindtap=&quo ...

  6. verilog 建模笔记--低级建模

    来源  <verilog HDL那些事--建模篇> 1.并行建模的思想. 2.每个模块最好只有一个功能.(便于修改和扩展,特别在大的项目中) 典型的 HDL 教科书中,才不会要读者了解“模 ...

  7. win10系统怎么关闭自动更新

    现在win10已经很普遍了,对于win10 现在还不是很完美,比如自动更新功能,现在的选项中没有关闭自动更新的选项了,这是一个bug,微软要强制更新.我就忍受不了自动更新,会拉取网络,影响我们的上网体 ...

  8. Java之Object类与instanceof关键字

    Object类是所有类的父类: 我们上下代码: package com.learn.chap03.sec14; public class A { // 上面类A继承了Object类,因此又可这样定义: ...

  9. NPOI-WebForm_Excel导入与导出

    本文面对的是第一次 接触NPOI的童鞋 不必为了一些琐碎的事情搞的心情烦躁 废话不多说先上 Demo 的全家福 接下来直接上代码 public partial class _Default : Sys ...

  10. 【原】Coursera—Andrew Ng机器学习—课程笔记 Lecture 5 Octave Tutorial—5.5 控制语句: for, while, if 语句

    5.5 控制语句: for, while, if 语句 参考视频: 5 - 5 - Control Statements_ for, while, if statements (13 min).mkv ...