libreoffice python 操作word及excel文档
1、开始、关闭libreoffice服务;
开始之前同步字体文件时间,是因为创建soffice服务时,服务会检查所需加载的文件的时间,如果其认为时间不符,则其可能会重新加载,耗时较长,因此需事先统一时间。
使用时如果需要多次调用,最好每次调用均先开启、用完后关闭,否则libreoffice会创建一个缓存文档并越用越大,处理时间会增加。
class OfficeProcess(object):
    """Start and stop a background ``soffice`` process that accepts UNO
    connections on ``localhost:2002``.

    The accompanying notes recommend starting and stopping the service around
    every use, because a long-lived instance keeps a growing cache document
    and gets slower over time.
    """

    def __init__(self):
        # 0 means "no soffice launched yet"; start_office() stores the Popen
        # handle here.
        self.p = 0
        # Give all font files one fixed mtime: per the accompanying notes,
        # soffice checks file times at start-up and may do a slow reload when
        # they look inconsistent.  Block until the command has finished so the
        # timestamps are settled before start_office() can run.
        subprocess.call('find /usr/share/fonts | xargs touch -m -t 201801010000.00', shell=True)

    def start_office(self):
        """Launch soffice and block until its UNO socket can be resolved.

        Polls once per second, printing a progress line for every failed
        attempt.  There is no timeout: the call only returns once the
        connection succeeds.
        """
        self.p = subprocess.Popen('soffice --pidfile=sof.pid --invisible --accept="socket,host=localhost,port=2002;urp;"', shell=True)
        while True:
            try:
                local_context = uno.getComponentContext()
                resolver = local_context.getServiceManager().createInstanceWithContext('com.sun.star.bridge.UnoUrlResolver', local_context)
                resolver.resolve('uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext')
                return
            except Exception:
                # Not a bare ``except``: KeyboardInterrupt / SystemExit must
                # be able to break out of this otherwise endless retry loop.
                print(ts(), "wait for connecting soffice...")
                time.sleep(1)

    def stop_office(self):
        """Send SIGTERM to the pid recorded in ``sof.pid`` and reap the launcher.

        Raises whatever ``open`` raises when ``sof.pid`` does not exist.
        A malformed pid or an already-dead process is ignored (best effort),
        in which case the launcher process is not waited for.
        """
        with open("sof.pid", "rb") as f:
            try:
                os.kill(int(f.read()), signal.SIGTERM)
            except (ValueError, OverflowError, OSError):
                # Unparsable pid file, or the process is already gone.
                return
        # self.p is still 0 when start_office() was never called.
        if self.p:
            self.p.wait()
2、init service manager
# Obtain the local (in-process) component context and, through its service
# manager, a UnoUrlResolver that can reach the running soffice instance.
local_context = uno.getComponentContext()
service_manager = local_context.getServiceManager()
resolver = service_manager.createInstanceWithContext('com.sun.star.bridge.UnoUrlResolver', local_context)
# Resolve the remote component context exposed on localhost:2002.
self.ctx = resolver.resolve('uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext')
# Service manager of the resolved context; used below to create the Desktop.
self.smgr = self.ctx.ServiceManager
# Desktop service instance, created through the resolved context.
self.desktop = self.smgr.createInstanceWithContext('com.sun.star.frame.Desktop', self.ctx)
3、从二进制数据中读取doc文档
def ImportFromMemory(self, data):
    """Load a Writer document from in-memory bytes.

    ``data`` is wrapped in a ``SequenceInputStream`` and handed to the
    desktop through the ``private:stream/swriter`` URL.  On success
    ``self.document`` and ``self.text`` are set; on any load error
    ``self.text`` is set to ``None``.  ``self.doc`` is always reset to an
    empty result skeleton.
    """
    istream = self.smgr.createInstanceWithContext('com.sun.star.io.SequenceInputStream', self.ctx)
    istream.initialize((uno.ByteSequence(data), ))
    pv = PropertyValue()
    pv.Name = 'InputStream'
    pv.Value = istream
    self.doc = {'doc': []}
    try:
        self.document = self.desktop.loadComponentFromURL('private:stream/swriter', '_blank', 0, (pv, ))
        self.text = self.document.getText()
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # still propagate; load failures keep the best-effort behaviour.
        self.text = None
4、读取doc文档中的数据
def ExportToJson(self):
    """Walk ``self.text`` and return the collected structure as a JSON string.

    The parsed elements are appended to ``self.doc['doc']`` and the summed
    text length is stored under ``'length'``.  If parsing fails for any
    reason the result falls back to an empty document of length 0.
    """
    try:
        l = self.__ParseText(self.text, self.__Callback(self.doc['doc']))
        self.doc['length'] = l
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # are not swallowed.
        self.doc = {'doc': [], 'length': 0}
    return json.dumps(self.doc)

@staticmethod
def __Callback(alist):
    """Return a one-argument callable that appends its argument to ``alist``."""
    def Append(sth):
        alist.append(sth)
    return Append
def __ParseText(self, text, func):
    """Enumerate the elements of ``text`` and parse paragraphs and tables.

    Paragraph elements go to ``__ParseParagraph``, text tables to
    ``__ParseTable``; anything else is skipped.  Returns the sum of the
    lengths reported by those parsers.
    """
    total = 0
    elements = text.createEnumeration()
    while elements.hasMoreElements():
        item = elements.nextElement()
        if item.supportsService('com.sun.star.text.Paragraph'):
            total += self.__ParseParagraph(item, func)
            continue
        if item.supportsService('com.sun.star.text.TextTable'):
            total += self.__ParseTable(item, func)
    return total
def __ParseParagraph(self, paragraph, func):
    """Parse one paragraph into ``{'paragraph': [...], 'length': n}``.

    Text and text-field portions are recorded as text; soft page breaks are
    ignored; every other portion type is scanned for anchored text content.
    The finished dict is passed to ``func`` and its length is returned.
    """
    p = {'paragraph': []}
    l = 0
    add = self.__Callback(p['paragraph'])
    paragraph_it = paragraph.createEnumeration()
    while paragraph_it.hasMoreElements():
        portion = paragraph_it.nextElement()
        kind = portion.TextPortionType
        if kind in ('Text', 'TextField'):
            l += self.__ParsePortionText(portion, add)
        elif kind == 'SoftPageBreak':
            pass
        else:
            l += self.__ParseTextContent(portion, add)
    # NOTE(review): the original indentation was lost; this check is assumed
    # to run once per paragraph, after all portions - confirm.
    if hasattr(paragraph, 'createContentEnumeration'):
        l += self.__ParseTextContent(paragraph, add)
    p['length'] = l
    func(p)
    return l

def __ParseTextContent(self, textcontent, func):
    """Parse the text contents enumerated from ``textcontent``.

    Graphics, text frames and group shapes are parsed; embedded objects and
    everything else are skipped.  Returns the summed length.
    """
    l = 0
    content_it = textcontent.createContentEnumeration('com.sun.star.text.TextContent')
    while content_it.hasMoreElements():
        element = content_it.nextElement()
        if element.supportsService('com.sun.star.text.TextGraphicObject'):
            l += self.__ParsePortionGraphic(element, func)
        elif element.supportsService('com.sun.star.text.TextEmbeddedObject'):
            pass
        elif element.supportsService('com.sun.star.text.TextFrame'):
            l += self.__ParseFrame(element, func)
        elif element.supportsService('com.sun.star.drawing.GroupShape'):
            l += self.__ParseGroup(element, func)
    return l

def __ParseFrame(self, frame, func):
    """Parse the text of ``frame`` into ``{'frame': [...], 'length': n}``."""
    f = {'frame': []}
    l = self.__ParseText(frame.getText(), self.__Callback(f['frame']))
    f['length'] = l
    func(f)
    return l

def __ParseGroup(self, group, func):
    """Parse every member of a group shape that supports drawing text."""
    l = 0
    for i in range(group.getCount()):
        it = group.getByIndex(i)
        if it.supportsService('com.sun.star.drawing.Text'):
            l += self.__ParseFrame(it, func)
    return l

def __ParsePortionText(self, portion_text, func):
    """Record the string of a text portion and return its length."""
    content = portion_text.String
    func({'portion': content, 'length': len(content)})
    return len(content)

def __ParsePortionGraphic(self, portion_graphic, func):
    """Record a graphic as a base64-encoded PNG plus its size/crop values.

    The graphic is stored as ``image/png`` into a ``TempFile`` stream through
    a ``GraphicProvider`` and read back from that stream.  Images contribute
    0 to the text length.
    """
    gp = self.smgr.createInstanceWithContext('com.sun.star.graphic.GraphicProvider', self.ctx)
    stream = self.smgr.createInstanceWithContext('com.sun.star.io.TempFile', self.ctx)
    pv1 = PropertyValue()
    pv1.Name = 'OutputStream'
    pv1.Value = stream
    pv2 = PropertyValue()
    pv2.Name = 'MimeType'
    pv2.Value = 'image/png'
    gp.storeGraphic(portion_graphic.Graphic, (pv1, pv2))
    stream.getOutputStream().flush()
    # Rewind before measuring and again before reading the stored bytes.
    stream.seek(0)
    l = stream.getInputStream().available()
    b = uno.ByteSequence(b'')
    stream.seek(0)
    l, b = stream.getInputStream().readBytes(b, l)
    img = {'image': base64.b64encode(b.value).decode('ascii')}
    img['height'] = portion_graphic.Height
    img['width'] = portion_graphic.Width
    img['actualheight'] = portion_graphic.ActualSize.Height
    img['actualwidth'] = portion_graphic.ActualSize.Width
    img['croptop'] = portion_graphic.GraphicCrop.Top
    img['cropbottom'] = portion_graphic.GraphicCrop.Bottom
    img['cropleft'] = portion_graphic.GraphicCrop.Left
    img['cropright'] = portion_graphic.GraphicCrop.Right
    img['length'] = 0
    func(img)
    return 0

def __ParseTable(self, table, func):
    """Parse a text table into a row/column dict and pass it to ``func``.

    Each cell keeps its ``rowspan``, ``colspan`` and ``name`` and gets a
    ``content`` list holding the parsed cell text.  If the spans do not add
    up to rows x columns, or anything else fails, the table is discarded and
    0 is returned.
    """
    l = 0
    try:
        matrix = self.__GetTableMatrix(table)
        seps = self.__GetTableSeparators(table)
        t = {}
        count = 0
        for ri in matrix.keys():
            t[ri] = {}
            for ci in matrix[ri].keys():
                # Copy the cell record without the UNO cell object.
                t[ri][ci] = dict(matrix[ri][ci])
                del t[ri][ci]['cell']
                t[ri][ci]['content'] = []
                l += self.__ParseText(matrix[ri][ci]['cell'], self.__Callback(t[ri][ci]['content']))
                count += t[ri][ci]['rowspan'] * t[ri][ci]['colspan']
        # Consistency check: the spans must tile the whole grid exactly.
        if count != len(t) * len(seps):
            raise ValueError('count of cells error')
        func({'table': t, 'row': len(t), 'column': len(seps), 'length': l, 'tableid': self.table_id})
        self.table_id += 1
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # still propagate; malformed tables are still dropped.
        l = 0
        print('discard wrong table')
    return l

@staticmethod
def __GetTableSeparators(table):
    """Return the sorted, de-duplicated column separator positions of ``table``.

    Collects the separator positions of every row plus the relative column
    sum, then merges positions that differ by exactly 1 into one value.
    """
    result = [table.TableColumnRelativeSum]
    for ri in range(table.getRows().getCount()):
        result += [s.Position for s in table.getRows().getByIndex(ri).TableColumnSeparators]
    result = sorted(set(result))
    for i in range(len(result) - 1):
        # Snap a position onto its neighbour when they are 1 apart.
        result[i] += 1 if result[i] + 1 == result[i + 1] else 0
    return sorted(set(result))

@staticmethod
def __NameToRC(name):
    """Convert a cell name such as ``'B3'`` to zero-based ``(row, column)``.

    Column letters use 52 symbols (``A``-``Z`` = 0..25, ``a``-``z`` = 26..51)
    in a bijective numbering, i.e. ``'AA'`` follows ``'z'`` and is column 52.
    Raises ``ValueError`` when the non-letter part is not an integer.
    """
    r = int(re.sub('[A-Za-z]', '', name)) - 1
    cstr = re.sub('[0-9]', '', name)
    c = 0
    for ch in cstr:
        if 'A' <= ch <= 'Z':
            digit = ord(ch) - ord('A')
        else:
            digit = 26 + ord(ch) - ord('a')
        # Bijective base-52: every letter counts as digit + 1 so that
        # multi-letter names do not collide with single-letter ones.
        c = c * 52 + digit + 1
    # Back to zero-based; a name without letters keeps the old result 0.
    return r, (c - 1 if cstr else 0)

@staticmethod
def __GetTableMatrix(table):
    """Build ``{row: {col: {'cell', 'rowspan', 'name', 'colspan'}}}`` for ``table``.

    ``colspan`` is derived by locating the cell's left and right separators
    of its own row within the table-wide separator list.
    """
    result = {}
    for name in table.getCellNames():
        ri, ci = WordToJson.__NameToRC(name)
        cell = table.getCellByName(name)
        if ri not in result:
            result[ri] = {}
        result[ri][ci] = {'cell': cell, 'rowspan': cell.RowSpan, 'name': name}
    seps = WordToJson.__GetTableSeparators(table)
    for ri in result.keys():
        sep = [s.Position for s in table.getRows().getByIndex(ri).TableColumnSeparators] + [table.TableColumnRelativeSum]
        sep = sorted(set(sep))
        for ci in result[ri].keys():
            # A row separator missing from ``seps`` was snapped to +1 by
            # __GetTableSeparators, so look it up at position + 1.
            right = seps.index(sep[ci]) if sep[ci] in seps else seps.index(sep[ci] + 1)
            left = -1 if ci == 0 else seps.index(sep[ci - 1]) if sep[ci - 1] in seps else seps.index(sep[ci - 1] + 1)
            result[ri][ci]['colspan'] = right - left
    return result
5、写doc文档
# Create a new, empty Writer document for output.
self.doco = self.desktop.loadComponentFromURL('private:factory/swriter', '_blank', 0, ())
self.texto = self.doco.getText()
# Cursor used by the write helpers to insert content into the document text.
self.cursoro = self.texto.createTextCursor()
# Set the paragraph bottom margin on the cursor.
self.cursoro.ParaBottomMargin = 500
def __WriteText(self, text, texto, cursoro):
    """Write each item of ``text`` at ``cursoro`` inside ``texto``.

    Items are dispatched on the key they contain: ``'paragraph'``,
    ``'image'`` or ``'table'``.  Items with none of these keys are ignored.
    """
    for it in text:
        if 'paragraph' in it:
            self.__WriteParagraph(it, texto, cursoro)
        elif 'image' in it:
            self.__WritePortionGraphic(it, texto, cursoro)
        elif 'table' in it:
            self.__WriteTable(it, texto, cursoro)

def __WriteParagraph(self, paragraph, texto, cursoro):
    """Insert the paragraph text followed by a paragraph break.

    When the item has a ``'result'`` list, the ``'trans_sen'`` of every entry
    is inserted; otherwise ``paragraph['paragraph']`` is inserted as is.
    """
    if paragraph['length'] > 0:
        if 'result' in paragraph:
            for it in paragraph['result']:
                texto.insertString(cursoro, it['trans_sen'], False)
        else:
            texto.insertString(cursoro, paragraph['paragraph'], False)
    # NOTE(review): the original indentation was lost; the break is assumed
    # to be emitted for every paragraph, including empty ones - confirm.
    texto.insertControlCharacter(cursoro, ControlCharacter.PARAGRAPH_BREAK, False)

def __WritePortionGraphic(self, portion_graphic, texto, cursoro):
    """Insert a base64-encoded image followed by a paragraph break.

    ``'actualheight'`` / ``'actualwidth'`` fall back to ``'height'`` /
    ``'width'`` and missing crop values default to 0.
    """
    png_base64 = portion_graphic['image']
    png = base64.b64decode(png_base64)
    gp = self.smgr.createInstanceWithContext('com.sun.star.graphic.GraphicProvider', self.ctx)
    istream = self.smgr.createInstanceWithContext('com.sun.star.io.SequenceInputStream', self.ctx)
    istream.initialize((uno.ByteSequence(png), ))
    pv = PropertyValue()
    pv.Name = 'InputStream'
    pv.Value = istream

    actualsize = uno.createUnoStruct('com.sun.star.awt.Size')
    actualsize.Height = portion_graphic['actualheight'] if 'actualheight' in portion_graphic else portion_graphic['height']
    actualsize.Width = portion_graphic['actualwidth'] if 'actualwidth' in portion_graphic else portion_graphic['width']
    graphiccrop = uno.createUnoStruct('com.sun.star.text.GraphicCrop')
    graphiccrop.Top = portion_graphic.get('croptop', 0)
    graphiccrop.Bottom = portion_graphic.get('cropbottom', 0)
    graphiccrop.Left = portion_graphic.get('cropleft', 0)
    graphiccrop.Right = portion_graphic.get('cropright', 0)

    image = self.doco.createInstance('com.sun.star.text.TextGraphicObject')
    image.Surround = NONE
    image.Graphic = gp.queryGraphic((pv, ))
    image.Height = portion_graphic['height']
    image.Width = portion_graphic['width']
    image.setPropertyValue('ActualSize', actualsize)
    image.setPropertyValue('GraphicCrop', graphiccrop)
    texto.insertTextContent(cursoro, image, False)
    texto.insertControlCharacter(cursoro, ControlCharacter.PARAGRAPH_BREAK, False)

def __WriteTable(self, table, texto, cursoro):
    """Insert a table, merge spanned cells and write every cell's content.

    ``table['table']`` is keyed by stringified row and column indices.
    Cells that cannot be looked up by name after merging are skipped.
    """
    tableo = self.doco.createInstance('com.sun.star.text.TextTable')
    tableo.initialize(table['row'], table['column'])
    texto.insertTextContent(cursoro, tableo, False)
    tcursoro = tableo.createCursorByCellName("A1")

    # Probe: if extending the selection one row down still reports range
    # 'A1', cell merging is skipped entirely for this table.
    hitbug = False
    if table['row'] > 1:
        tcursoro.goDown(1, True)
        hitbug = tcursoro.getRangeName() == 'A1'

    for ri in sorted(int(r) for r in table['table'].keys()):
        rs = table['table'][str(ri)]
        for ci in sorted(int(c) for c in rs.keys()):
            cell = rs[str(ci)]
            if not hitbug and (cell['rowspan'] > 1 or cell['colspan'] > 1):
                tcursoro.gotoCellByName(cell['name'], False)
                if cell['rowspan'] > 1:
                    tcursoro.goDown(cell['rowspan'] - 1, True)
                if cell['colspan'] > 1:
                    tcursoro.goRight(cell['colspan'] - 1, True)
                tcursoro.mergeRange()
            ctexto = tableo.getCellByName(cell['name'])
            if ctexto is None:
                continue
            ccursoro = ctexto.createTextCursor()
            ccursoro.CharWeight = FontWeight.NORMAL
            ccursoro.CharWeightAsian = FontWeight.NORMAL
            ccursoro.ParaAdjust = LEFT
            # Cell content uses the same item format as the document body.
            self.__WriteText(cell['content'], ctexto, ccursoro)
6、生成二进制的doc文档数据
# Store the Writer document into an in-memory Pipe using the
# 'MS Word 2007 XML' export filter.
streamo = self.smgr.createInstanceWithContext('com.sun.star.io.Pipe', self.ctx)
self.doco.storeToURL('private:stream', (PropertyValue('FilterName', 0, 'MS Word 2007 XML', 0), PropertyValue('OutputStream', 0, streamo, 0)))
streamo.flush()
# Read everything currently available in the pipe; datao receives the bytes.
_, datao = streamo.readBytes(None, streamo.available())
7、从doc文档数据生成pdf的二进制数据
# Store the Writer document into an in-memory Pipe using the
# 'writer_pdf_Export' filter.
streamo = self.smgr.createInstanceWithContext('com.sun.star.io.Pipe', self.ctx)
self.doco.storeToURL('private:stream', (PropertyValue('FilterName', 0, 'writer_pdf_Export', 0), PropertyValue('OutputStream', 0, streamo, 0)))
streamo.flush()
# Read everything currently available in the pipe; datap receives the bytes.
_, datap = streamo.readBytes(None, streamo.available())
8、读取excel二进制数据
def ImportFromMemory(self, data):
    """Load a Calc document from in-memory bytes.

    ``data`` is wrapped in a ``SequenceInputStream`` and handed to the
    desktop through the ``private:stream/scalc`` URL.  On success
    ``self.document`` and ``self.sheets`` are set; on any load error
    ``self.sheets`` is set to ``None``.  ``self.doc`` is always reset to an
    empty result skeleton.
    """
    istream = self.smgr.createInstanceWithContext('com.sun.star.io.SequenceInputStream', self.ctx)
    istream.initialize((uno.ByteSequence(data), ))
    pv = PropertyValue()
    pv.Name = 'InputStream'
    pv.Value = istream
    self.doc = {'doc': []}
    try:
        print("before loadComponentFromURL")
        self.document = self.desktop.loadComponentFromURL('private:stream/scalc', '_blank', 0, (pv, ))
        self.sheets = self.document.getSheets()
        print("ImportFromMemory done")
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # still propagate; load failures keep the best-effort behaviour.
        print("ImportFromMemory failed")
        self.sheets = None
9、读取excel的文本数据
def ExportToJson(self):
    """Walk ``self.sheets`` and return the collected cell texts as a JSON string.

    Parsed sheets are appended to ``self.doc['doc']`` and the summed text
    length is stored under ``'length'``.  If parsing fails for any reason
    the result falls back to an empty document of length 0.
    """
    try:
        l = self.__ParseText(self.sheets, self.__Callback(self.doc['doc']))
        self.doc['length'] = l
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # are not swallowed.
        self.doc = {'doc': [], 'length': 0}
    return json.dumps(self.doc)
def __ParseText(self, sheets, func):
    """Parse every spreadsheet in ``sheets`` and return the summed text length."""
    l = 0
    sheets_it = sheets.createEnumeration()
    while sheets_it.hasMoreElements():
        element = sheets_it.nextElement()
        if element.supportsService('com.sun.star.sheet.Spreadsheet'):
            l += self.__ParseSpreadsheet(element, func)
    return l

def __ParseSpreadsheet(self, spreadsheet, func):
    """Collect the TEXT cells among the visible cells of one sheet.

    Builds ``{'spreadsheet': [...], 'length': n}`` and passes it to ``func``.
    Empty, value and formula cells are only logged, not recorded.
    """
    l = 0
    p = {'spreadsheet': []}
    visible_cells_it = spreadsheet.queryVisibleCells().getCells().createEnumeration()
    while visible_cells_it.hasMoreElements():
        cell = visible_cells_it.nextElement()
        # Named ``cell_type`` so the builtin ``type`` is not shadowed.
        cell_type = cell.getType()
        if cell_type == self.EMPTY:
            print("cell.type==empty")
        elif cell_type == self.VALUE:
            print("cell.type==VALUE", "value=", cell.getValue(), cell.getCellAddress())
        elif cell_type == self.TEXT:
            print("cell.type==TEXT", "content=", cell.getString().encode("UTF-8"), cell.getCellAddress())
            l += self.__ParseCellText(spreadsheet, cell, self.__Callback(p['spreadsheet']))
            print("__ParseCellText=", p)
        elif cell_type == self.FORMULA:
            print("cell.type==FORMULA", "formula=", cell.getValue())
    p['length'] = l
    func(p)
    return l

def __ParseCellText(self, sheet, cell, func):
    """Record one cell's string with its column (x), row (y) and sheet name.

    If the address or sheet name cannot be read, ``x``/``y`` become -1 and
    ``sheetname`` becomes ``None``.  Returns the length of the cell string.
    """
    try:
        x = cell.getCellAddress().Column
        y = cell.getCellAddress().Row
        sheetname = sheet.getName()
    except Exception:
        # Narrowed from a bare ``except``; keeps the sentinel fallback.
        x = -1
        y = -1
        sheetname = None
    content = cell.getString()
    func({'celltext': content, 'x': x, 'y': y, 'sheetname': sheetname, 'length': len(content)})
    return len(content)
# CellContentType enum values, kept on the instance for comparison against
# the value returned by cell.getType().
self.EMPTY = uno.Enum("com.sun.star.table.CellContentType", "EMPTY")
self.TEXT = uno.Enum("com.sun.star.table.CellContentType", "TEXT")
self.FORMULA = uno.Enum("com.sun.star.table.CellContentType", "FORMULA")
self.VALUE = uno.Enum("com.sun.star.table.CellContentType", "VALUE")
10、替换excel的文本信息
def ImportFromJson(self, data):
    """Apply the replacement texts described by the JSON string ``data``.

    ``data`` must decode to an object with a ``'doc'`` list; invalid JSON
    raises the ``json.loads`` error.  Failures while writing are reported on
    stdout and otherwise ignored (best effort).
    """
    doc = json.loads(data)
    try:
        self.__WriteText(doc['doc'])
    except Exception as exc:
        # Previously a silent ``except: pass``; still best effort, but the
        # failure is now visible and KeyboardInterrupt is not swallowed.
        print("ImportFromJson failed:", exc)
def __WriteText(self, text):
    """Write every item of ``text`` that names a paragraph and a sheet.

    The sheet is looked up by name only when it differs from the one used
    for the previous item; items whose sheet cannot be found are skipped.
    """
    print("__WriteText begin:", text)
    sheet = None
    for it in text:
        if 'paragraph' in it and 'sheetname' in it:
            if sheet is None or sheet.getName() != it['sheetname']:
                try:
                    sheet = self.sheets.getByName(it['sheetname'])
                    print("getsheet:", it['sheetname'], "=", sheet.getName())
                except Exception:
                    # Narrowed from a bare ``except``; unknown sheet -> skip.
                    sheet = None
                    continue
            self.__WriteParagraph(it, sheet)

def __WriteParagraph(self, paragraph, sheet):
    """Replace the string of the cell at (``x``, ``y``) with the translated text.

    Does nothing when ``paragraph['length']`` is 0, when the cell cannot be
    obtained, or when the item has no ``'result'`` list.
    """
    print("__WriteParagraph")
    if paragraph['length'] > 0:
        try:
            x = paragraph['x']
            y = paragraph['y']
            print("getcell:", x, y)
            cell = sheet.getCellByPosition(x, y)
            print("getcell done")
        except Exception:
            # Narrowed from a bare ``except``; missing cell -> nothing to do.
            return
        if 'result' in paragraph:
            # NOTE(review): every entry overwrites the cell, so only the last
            # 'trans_sen' remains - confirm this is intended.
            for it in paragraph['result']:
                print("cell=", cell.getString())
                cell.setString(it['trans_sen'])
                print("cell,", cell.getString(), ",done")
11、生成excel文档二进制数据
# Store the Calc document into an in-memory Pipe using the
# 'Calc MS Excel 2007 XML' export filter.
streamo = self.smgr.createInstanceWithContext('com.sun.star.io.Pipe', self.ctx)
self.document.storeToURL('private:stream', (PropertyValue('FilterName', 0, 'Calc MS Excel 2007 XML', 0), PropertyValue('OutputStream', 0, streamo, 0)))
streamo.flush()
# Read everything currently available in the pipe; datao receives the bytes.
_, datao = streamo.readBytes(None, streamo.available())
12、生成excel的pdf文档
# Store the Calc document into an in-memory Pipe using the
# 'calc_pdf_Export' filter.
streamo = self.smgr.createInstanceWithContext('com.sun.star.io.Pipe', self.ctx)
self.document.storeToURL('private:stream', (PropertyValue('FilterName', 0, 'calc_pdf_Export', 0), PropertyValue('OutputStream', 0, streamo, 0)))
streamo.flush()
# Read everything currently available in the pipe; datap receives the bytes.
_, datap = streamo.readBytes(None, streamo.available())
libreoffice python 操作word及excel文档的更多相关文章
- 基于DevExpress实现对PDF、Word、Excel文档的预览及操作处理
http://www.cnblogs.com/wuhuacong/p/4175266.html 在一般的管理系统模块里面,越来越多地涉及到一些常用文档的上传保存操作,其中如PDF、Word、Excel ...
- Aspose.Words操作word生成PDF文档
Aspose.Words操作word生成PDF文档 using Aspose.Words; using System; using System.Collections.Generic; using ...
- word ppt excel文档转换成pdf
1.把word文档转换成pdf (1).添加引用 using Microsoft.Office.Interop.Word; 添加引用 (2).转换方法 /// <summary> /// ...
- C#/VB.NET: 将Word或Excel文档转化为Text
Text文件只由纯文本内容组成,且没有格式,所以其大小比Word或Excel文件更小.除此之外,Text文件还具有跨平台性,几乎与所有应用程序都兼容.因此,在某些时候,我们可能需要将Word或Exce ...
- Python操作Word与Excel并打包
安装模块 # Word操作库 pip install docx # Excel操作库 pip install openpyxl # 打包exe工具 pip install pyinstaller Wo ...
- php 如何写入、读取word,excel文档
如何在php写入.读取word文档 <? //如何在php写入.读取word文档 // 建立一个指向新COM组件的索引 $word = new COM("word.applicatio ...
- 使用NOPI读取Word、Excel文档内容
使用NOPI读取Excel的例子很多,读取Word的例子不多. Excel的解析方式有多中,可以使用ODBC查询,把Excel作为一个数据集对待.也可以使用文档结构模型的方式进行解析,即解析Workb ...
- Python比较两个excel文档内容的异同
#-*- coding: utf-8 -*- #比对两个Excel文件内容的差异#---------------------假设条件----------------#1.源表和目标表格式一致#2.不存 ...
- 在线预览word,excel文档
Google Doc 示例:https://jsfiddle.net/7xr419yb/ Microsoft Office 示例:https://jsfiddle.net/gcuzq343/
随机推荐
- 动态规划状态压缩-poj1143
题目链接:http://poj.org/problem?id=1143 题目描述: 代码实现: #include <iostream> #include <string.h> ...
- 【python】函数式编程
No1: 函数式编程:即函数可以作为参数传递,也可以作为返回值 No2: map()函数接收两个参数,一个是函数,一个是Iterable,map将传入的函数依次作用到序列的每个元素,并把结果作为新的 ...
- Square Destroyer-POJ 1084 (IDA*)
Description The left figure below shows a complete 3*3 grid made with 2*(3*4) (=24) matchsticks. The ...
- notepad++ 注释
在用notepad++进行代码编辑的过程中,总感觉还是有keil那样可以进行多行注释的快捷方式方便,其实notepad++也可以进行单行.多行.区块注释和取消注释的....... 快捷键如下: 单行. ...
- Unity容器中AOP应用示例程序
转发请注明出处:https://www.cnblogs.com/zhiyong-ITNote/p/9127001.html 实在没有找到Unity容器的AOP应用程序示例的说明,在微软官网找到了教程( ...
- Alpha(8/10)
鐵鍋燉腯鱻 项目:小鱼记账 团队成员 项目燃尽图 冲刺情况描述 站立式会议照片 各成员情况 团队成员 学号 姓名 git地址 博客地址 031602240 许郁杨 (组长) https://githu ...
- MacOs brew 命令行安装常见工具
brew类似ubuntu系统下的apt-get的功能 安装方法: 在Mac中打开Termal: 输入命令: ruby -e "$(curl -fsSL https://raw.githu ...
- 【Excel】将IP按照IP地址(v4)增长序列排序
Background: Excel列中,有多个net-block, 将这些net-block按照IP地址(v4)自己的大小从小到大排序. Idea: IPv4地址的格式是点分十进制的,也就是说每一个点 ...
- 在Node.js使用Promise的方式操作Mysql
最近在学习Node.js,虽然早就听说了回调地狱结果过了一周就遇到了.所以花时间学习了了一下Promise.虽然还有Async/await.co.生成器等选择,但是因为本人基础较差,以及时间问题所以决 ...
- [数据库]Sqlite使用入门
官网的文档结构十分恶劣,大概翻了一下,提供入门指引. 0. sqlite的安装 根据自身情况,在官网下载32位/64位的dll文件以及sqlite-tools-win32-x86-3240000.zi ...