libreoffice python 操作word及excel文档

1、开始、关闭libreoffice服务；

开始之前同步字体文件时间，是因为创建soffice服务时，服务会检查所需加载的文件的时间，如果其认为时间不符，则其可能会重新加载，耗时较长，因此需事先统一时间。

使用时如果需要多次调用，最好每次调用都先开启服务、用完后再关闭，否则libreoffice会创建一个缓存文档并越用越大，处理时间会增加。

class OfficeProcess(object):
    """Start and stop a headless soffice process that accepts UNO connections.

    The process listens on localhost:2002 and writes its pid to ``sof.pid``
    in the current working directory.
    """

    def __init__(self):
        """Set every font file to one fixed mtime before soffice is started."""
        self.p = None
        # Block until the touch has finished; a fire-and-forget Popen could
        # still be rewriting timestamps while soffice is scanning the fonts.
        subprocess.call('find /usr/share/fonts | xargs touch -m -t 201801010000.00', shell=True)

    def start_office(self, timeout=None):
        """Launch soffice and return once a UNO connection can be resolved.

        Args:
            timeout: optional number of seconds to keep retrying. ``None``
                (the default) retries forever, as before.

        Raises:
            Exception: the last connection error, when *timeout* elapses.
        """
        self.p = subprocess.Popen('soffice --pidfile=sof.pid --invisible --accept="socket,host=localhost,port=2002;urp;"', shell=True)
        deadline = None if timeout is None else time.time() + timeout
        while True:
            try:
                local_context = uno.getComponentContext()
                resolver = local_context.getServiceManager().createInstanceWithContext('com.sun.star.bridge.UnoUrlResolver', local_context)
                resolver.resolve('uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext')
                return
            except Exception:
                # ``Exception`` rather than a bare except, so Ctrl-C can
                # still interrupt the wait loop.
                if deadline is not None and time.time() >= deadline:
                    raise
                print(ts(), "wait for connecting soffice...")
                time.sleep(1)

    def stop_office(self):
        """Send SIGTERM to the pid stored in ``sof.pid`` and wait for the launcher.

        Errors while parsing the pid or signalling the process are ignored
        (best effort). A missing ``sof.pid`` still raises from ``open``.
        """
        with open("sof.pid", "rb") as f:
            try:
                os.kill(int(f.read()), signal.SIGTERM)
            except (OSError, ValueError):
                # Garbled pid file, or the process is already gone.
                return
        # Only wait when start_office() actually created a child process.
        if self.p is not None:
            self.p.wait()

2、init service manager

# Connect to the office instance listening on localhost:2002 and keep the
# handles (context, service manager, desktop) that the other methods use.
local_context = uno.getComponentContext()

        service_manager = local_context.getServiceManager()

        # The resolver turns the uno: URL into the component context of the
        # process that accepted the socket connection.
        resolver = service_manager.createInstanceWithContext('com.sun.star.bridge.UnoUrlResolver', local_context)

        self.ctx = resolver.resolve('uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext')

        self.smgr = self.ctx.ServiceManager

        # The Desktop service is what loadComponentFromURL is called on elsewhere.
        self.desktop = self.smgr.createInstanceWithContext('com.sun.star.frame.Desktop', self.ctx)

3、从二进制数据中读取doc文档

def ImportFromMemory(self, data):
        """Load a Writer document from in-memory bytes.

        Wraps *data* in a SequenceInputStream and loads it through the
        ``private:stream/swriter`` URL. Resets ``self.doc`` to an empty
        result. On success ``self.document`` and ``self.text`` are set; on
        any load error ``self.text`` is set to ``None``.
        """
        istream = self.smgr.createInstanceWithContext('com.sun.star.io.SequenceInputStream', self.ctx)
        istream.initialize((uno.ByteSequence(data), ))
        pv = PropertyValue()
        pv.Name = 'InputStream'
        pv.Value = istream
        self.doc = {'doc': []}
        try:
            self.document = self.desktop.loadComponentFromURL('private:stream/swriter', '_blank', 0, (pv, ))
            self.text = self.document.getText()
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are no longer mistaken for a failed load.
            self.text = None

4、读取doc文档中的数据

def ExportToJson(self):
        """Parse ``self.text`` into ``self.doc`` and return it as a JSON string.

        The parsed elements are appended to ``self.doc['doc']`` and the
        total length is stored under ``'length'``. If parsing raises, the
        result is reset to an empty document with length 0.
        """
        try:
            l = self.__ParseText(self.text, self.__Callback(self.doc['doc']))
            self.doc['length'] = l
        except Exception:
            # Narrowed from a bare except: parse failures still yield an
            # empty document, but Ctrl-C / SystemExit now propagate.
            self.doc = {'doc': [], 'length': 0}
        return json.dumps(self.doc)

@staticmethod
def __Callback(alist):
        """Return a one-argument callable that appends its argument to *alist*."""
        return alist.append

def __ParseText(self, text, func):
        """Enumerate *text* and hand paragraphs and tables to their parsers.

        Elements supporting neither service are skipped. Returns the sum of
        the lengths reported by the parsers.
        """
        total = 0
        elements = text.createEnumeration()
        while elements.hasMoreElements():
            node = elements.nextElement()
            if node.supportsService('com.sun.star.text.Paragraph'):
                total += self.__ParseParagraph(node, func)
                continue
            if node.supportsService('com.sun.star.text.TextTable'):
                total += self.__ParseTable(node, func)
        return total

def __ParseParagraph(self, paragraph, func):
        """Collect the portions of one paragraph into a ``{'paragraph': [...]}`` record.

        'Text' and 'TextField' portions are stored as text, 'SoftPageBreak'
        portions are ignored, and any other portion is scanned for anchored
        text content. The record, including its 'length', is passed to
        *func*, and the length is returned.
        """
        record = {'paragraph': []}
        total = 0
        portions = paragraph.createEnumeration()
        while portions.hasMoreElements():
            portion = portions.nextElement()
            kind = portion.TextPortionType
            if kind == 'SoftPageBreak':
                continue
            if kind in ('Text', 'TextField'):
                total += self.__ParsePortionText(portion, self.__Callback(record['paragraph']))
            else:
                total += self.__ParseTextContent(portion, self.__Callback(record['paragraph']))
        # The paragraph object itself is also scanned for text content.
        if hasattr(paragraph, 'createContentEnumeration'):
            total += self.__ParseTextContent(paragraph, self.__Callback(record['paragraph']))
        record['length'] = total
        func(record)
        return total

    def __ParseTextContent(self, textcontent, func):
        """Dispatch the TextContent objects of *textcontent* by service.

        Graphics, text frames and group shapes are parsed; embedded objects
        and anything else are skipped. Returns the summed length.
        """
        total = 0
        contents = textcontent.createContentEnumeration('com.sun.star.text.TextContent')
        while contents.hasMoreElements():
            item = contents.nextElement()
            # The order of the checks matters: the first matching service wins.
            if item.supportsService('com.sun.star.text.TextGraphicObject'):
                total += self.__ParsePortionGraphic(item, func)
            elif item.supportsService('com.sun.star.text.TextEmbeddedObject'):
                continue
            elif item.supportsService('com.sun.star.text.TextFrame'):
                total += self.__ParseFrame(item, func)
            elif item.supportsService('com.sun.star.drawing.GroupShape'):
                total += self.__ParseGroup(item, func)
        return total

    def __ParseFrame(self, frame, func):
        """Parse the text of *frame* into a ``{'frame': [...], 'length': n}`` record.

        The record is passed to *func* and the length is returned.
        """
        children = []
        length = self.__ParseText(frame.getText(), self.__Callback(children))
        func({'frame': children, 'length': length})
        return length

    def __ParseGroup(self, group, func):
        """Parse, as frames, the members of *group* that support drawing.Text.

        Returns the summed length of the parsed members.
        """
        members = (group.getByIndex(idx) for idx in range(group.getCount()))
        return sum(
            self.__ParseFrame(member, func)
            for member in members
            if member.supportsService('com.sun.star.drawing.Text')
        )

    def __ParsePortionText(self, portion_text, func):
        """Emit ``{'portion': text, 'length': len(text)}`` to *func* and return the length."""
        # Read the property once; the original fetched it three times.
        content = portion_text.String
        size = len(content)
        func({'portion': content, 'length': size})
        return size

    def __ParsePortionGraphic(self, portion_graphic, func):
        """Export an embedded graphic as base64-encoded PNG with its geometry.

        The graphic is stored through a GraphicProvider into a TempFile
        stream with MimeType 'image/png', read back, and base64-encoded.
        The resulting dict, which also carries size and crop values, is
        passed to *func*. Images add nothing to the text length, so the
        method returns 0.
        """
        gp = self.smgr.createInstanceWithContext('com.sun.star.graphic.GraphicProvider', self.ctx)
        stream = self.smgr.createInstanceWithContext('com.sun.star.io.TempFile', self.ctx)
        # storeGraphic arguments: where to write, and in which format.
        pv1 = PropertyValue()
        pv1.Name = 'OutputStream'
        pv1.Value = stream
        pv2 = PropertyValue()
        pv2.Name = 'MimeType'
        pv2.Value = 'image/png'
        gp.storeGraphic(portion_graphic.Graphic, (pv1, pv2))
        stream.getOutputStream().flush()
        # Rewind before asking how many bytes were written.
        stream.seek(0)
        l = stream.getInputStream().available()
        b = uno.ByteSequence(b'')
        stream.seek(0)
        # readBytes returns a (count, data) pair; both names are rebound here.
        l, b = stream.getInputStream().readBytes(b, l)
        img = {'image': base64.b64encode(b.value).decode('ascii')}
        img['height'] = portion_graphic.Height
        img['width'] = portion_graphic.Width
        img['actualheight'] = portion_graphic.ActualSize.Height
        img['actualwidth'] = portion_graphic.ActualSize.Width
        img['croptop'] = portion_graphic.GraphicCrop.Top
        img['cropbottom'] = portion_graphic.GraphicCrop.Bottom
        img['cropleft'] = portion_graphic.GraphicCrop.Left
        img['cropright'] = portion_graphic.GraphicCrop.Right
        # Always 0: the byte count read above is not reported as text length.
        img['length'] = 0
        func(img)
        return 0

    def __ParseTable(self, table, func):
        """Parse a text table into a nested ``{row: {col: cell}}`` record.

        Every cell record carries 'rowspan', 'colspan', 'name' and its
        parsed 'content'. The table is emitted to *func* only if the spans
        of all cells add up to rows * columns. Otherwise, or on any parse
        error, the table is discarded and 0 is returned.

        Returns the summed text length of all cells (0 when discarded).
        """
        l = 0
        try:
            matrix = self.__GetTableMatrix(table)
            seps = self.__GetTableSeparators(table)
            t = {}
            count = 0
            for ri, row in matrix.items():
                t[ri] = {}
                for ci, info in row.items():
                    # Copy everything except the live UNO cell object, which
                    # is not part of the exported record.
                    entry = {key: value for key, value in info.items() if key != 'cell'}
                    entry['content'] = []
                    t[ri][ci] = entry
                    l += self.__ParseText(info['cell'], self.__Callback(entry['content']))
                    count += entry['rowspan'] * entry['colspan']
            # Sanity check: the cell spans must tile the whole grid.
            if count != len(t) * len(seps):
                raise ValueError('count of cells error')
            func({'table': t, 'row': len(t), 'column': len(seps), 'length': l, 'tableid': self.table_id})
            self.table_id += 1
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are not reported as a malformed table.
            l = 0
            print('discard wrong table')
        return l

    @staticmethod
    def __GetTableSeparators(table):
        """Return the sorted, de-duplicated column boundary positions of *table*.

        The list holds the separator positions of every row plus
        ``TableColumnRelativeSum``. Positions that differ by exactly 1 are
        merged into the larger one.
        """
        result = [table.TableColumnRelativeSum]
        for ri in range(table.getRows().getCount()):
            result += [s.Position for s in table.getRows().getByIndex(ri).TableColumnSeparators]
        result = sorted(set(result))
        # Bump a value up by one when its successor is exactly one larger, so
        # the pair collapses in the final set() below.
        for i in range(len(result) - 1):
            result[i] += 1 if result[i] + 1 == result[i + 1] else 0
        return sorted(set(result))

    @staticmethod
    def __NameToRC(name):
        """Convert a table cell name such as ``"B3"`` to zero-based ``(row, column)``.

        Column letters use 52 symbols: 'A'-'Z' map to 0-25 and 'a'-'z' to
        26-51. Multi-letter names are decoded as bijective base 52, so
        "AA" is column 52 rather than colliding with "A" (column 0).

        Raises:
            ValueError: if the non-letter part of *name* is not a plain
                integer.
        """
        r = int(re.sub('[A-Za-z]', '', name)) - 1
        c = 0
        for ch in re.sub('[0-9]', '', name):
            if 'A' <= ch <= 'Z':
                digit = ord(ch) - ord('A')
            else:
                digit = 26 + ord(ch) - ord('a')
            # The +1 makes the numbering bijective: no symbol acts as zero,
            # so "A" and "AA" decode to different columns.
            c = c * 52 + digit + 1
        # Back to zero-based; a name without letters keeps the old result 0.
        return r, max(c - 1, 0)

    @staticmethod
    def __GetTableMatrix(table):
        """Build ``{row: {col: {'cell', 'rowspan', 'name', 'colspan'}}}`` for *table*.

        Row and column indices come from the cell names. 'colspan' is the
        number of table-wide separator slots between the cell's left and
        right boundaries.
        """
        result = {}
        for name in table.getCellNames():
            ri, ci = WordToJson.__NameToRC(name)
            cell = table.getCellByName(name)
            if ri not in result:
                result[ri] = {}
            result[ri][ci] = {'cell': cell, 'rowspan': cell.RowSpan, 'name': name}
        seps = WordToJson.__GetTableSeparators(table)
        for ri in result.keys():
            # Right-hand boundaries of the cells in this row, in order; the
            # table's relative sum closes the last cell.
            sep = [s.Position for s in table.getRows().getByIndex(ri).TableColumnSeparators] + [table.TableColumnRelativeSum]
            sep = sorted(set(sep))
            for ci in result[ri].keys():
                # A boundary missing from seps is looked up at position + 1,
                # matching the off-by-one merge done in __GetTableSeparators.
                right = seps.index(sep[ci]) if sep[ci] in seps else seps.index(sep[ci] + 1)
                # The first cell of a row starts before slot 0, hence -1.
                left = -1 if ci == 0 else seps.index(sep[ci - 1]) if sep[ci - 1] in seps else seps.index(sep[ci - 1] + 1)
                result[ri][ci]['colspan'] = right - left
        return result

5、写doc文档

# Create an empty Writer document for output and keep its text and cursor.
self.doco = self.desktop.loadComponentFromURL('private:factory/swriter', '_blank', 0, ())

        self.texto = self.doco.getText()

        self.cursoro = self.texto.createTextCursor()

        # Paragraph bottom margin set on the cursor used for inserted text.
        # NOTE(review): unit presumably 1/100 mm -- confirm.
        self.cursoro.ParaBottomMargin = 500

def __WriteText(self, text, texto, cursoro):
        """Write each item of *text* with the writer matching its key.

        Items are dispatched on the first of 'paragraph', 'image' or 'table'
        they contain. Items with none of these keys are ignored.
        """
        for item in text:
            if 'paragraph' in item:
                handler = self.__WriteParagraph
            elif 'image' in item:
                handler = self.__WritePortionGraphic
            elif 'table' in item:
                handler = self.__WriteTable
            else:
                continue
            handler(item, texto, cursoro)

    def __WriteParagraph(self, paragraph, texto, cursoro):
        """Insert one paragraph followed by a paragraph break.

        Paragraphs whose 'length' is not positive are skipped. When the
        paragraph has a 'result' list, the 'trans_sen' of every entry is
        inserted; otherwise ``paragraph['paragraph']`` is inserted as is.
        """
        if not paragraph['length'] > 0:
            return
        if 'result' not in paragraph:
            # NOTE(review): presumably a string by this point -- verify against the caller.
            texto.insertString(cursoro, paragraph['paragraph'], False)
        else:
            for sentence in paragraph['result']:
                texto.insertString(cursoro, sentence['trans_sen'], False)
        texto.insertControlCharacter(cursoro, ControlCharacter.PARAGRAPH_BREAK, False)

    def __WritePortionGraphic(self, portion_graphic, texto, cursoro):
        """Insert a base64-encoded image record as a TextGraphicObject.

        'height' and 'width' are required. 'actualheight' / 'actualwidth'
        fall back to them, and the four crop values default to 0. A
        paragraph break is inserted after the image.
        """
        raw_png = base64.b64decode(portion_graphic['image'])
        provider = self.smgr.createInstanceWithContext('com.sun.star.graphic.GraphicProvider', self.ctx)
        source = self.smgr.createInstanceWithContext('com.sun.star.io.SequenceInputStream', self.ctx)
        source.initialize((uno.ByteSequence(raw_png), ))
        stream_arg = PropertyValue()
        stream_arg.Name = 'InputStream'
        stream_arg.Value = source

        size = uno.createUnoStruct('com.sun.star.awt.Size')
        if 'actualheight' in portion_graphic:
            size.Height = portion_graphic['actualheight']
        else:
            size.Height = portion_graphic['height']
        if 'actualwidth' in portion_graphic:
            size.Width = portion_graphic['actualwidth']
        else:
            size.Width = portion_graphic['width']

        crop = uno.createUnoStruct('com.sun.star.text.GraphicCrop')
        for attr, key in (('Top', 'croptop'), ('Bottom', 'cropbottom'), ('Left', 'cropleft'), ('Right', 'cropright')):
            setattr(crop, attr, portion_graphic.get(key, 0))

        picture = self.doco.createInstance('com.sun.star.text.TextGraphicObject')
        picture.Surround = NONE
        picture.Graphic = provider.queryGraphic((stream_arg, ))
        picture.Height = portion_graphic['height']
        picture.Width = portion_graphic['width']
        picture.setPropertyValue('ActualSize', size)
        picture.setPropertyValue('GraphicCrop', crop)
        texto.insertTextContent(cursoro, picture, False)
        texto.insertControlCharacter(cursoro, ControlCharacter.PARAGRAPH_BREAK, False)

    def __WriteTable(self, table, texto, cursoro):
        """Insert a table record as a TextTable, merging cells and writing content.

        *table* carries 'row', 'column' and 'table', a mapping of
        string row index -> string column index -> cell record with 'name',
        'rowspan', 'colspan' and 'content'.
        """
        tableo = self.doco.createInstance('com.sun.star.text.TextTable')
        tableo.initialize(table['row'], table['column'])
        texto.insertTextContent(cursoro, tableo, False)
#        texto.insertControlCharacter(cursoro, ControlCharacter.PARAGRAPH_BREAK, False)
        tcursoro = tableo.createCursorByCellName("A1")
        # Probe: extend the selection one row down from A1. If the range name
        # is still 'A1', the cursor cannot select across rows here, and all
        # merging below is skipped.
        hitbug = False
        if table['row'] > 1:
            tcursoro.goDown(1, True)
            hitbug = tcursoro.getRangeName() == 'A1'
        # The keys are strings, so they are sorted numerically and then
        # converted back to strings for the lookup.
        for ri in sorted([int(r) for r in table['table'].keys()]):
            rs = table['table'][str(ri)]
            for ci in sorted([int(c) for c in rs.keys()]):
                cell = rs[str(ci)]
                if hitbug == False and (cell['rowspan'] > 1 or cell['colspan'] > 1):
                    # Select the spanned range starting at this cell, then merge it.
                    tcursoro.gotoCellByName(cell['name'], False)
                    if cell['rowspan'] > 1:
                        tcursoro.goDown(cell['rowspan'] - 1, True)
                    if cell['colspan'] > 1:
                        tcursoro.goRight(cell['colspan'] - 1, True)
                    tcursoro.mergeRange()
                ctexto = tableo.getCellByName(cell['name'])
                # Skip cells for which no cell object was returned.
                if ctexto == None:
                    continue
                ccursoro = ctexto.createTextCursor()
                # Write the cell content with normal weight, left aligned.
                ccursoro.CharWeight = FontWeight.NORMAL
                ccursoro.CharWeightAsian = FontWeight.NORMAL
                ccursoro.ParaAdjust = LEFT
                self.__WriteText(cell['content'], ctexto, ccursoro)

6、生成二进制的doc文档数据

        # Store the output document into an in-memory Pipe with the
        # 'MS Word 2007 XML' filter, then read all buffered bytes back.
        streamo = self.smgr.createInstanceWithContext('com.sun.star.io.Pipe', self.ctx)

        self.doco.storeToURL('private:stream', (PropertyValue('FilterName', 0, 'MS Word 2007 XML', 0), PropertyValue('OutputStream', 0, streamo, 0)))

        streamo.flush()

        # readBytes returns a pair; its second item is the data.
        _, datao = streamo.readBytes(None, streamo.available())

7、从doc文档数据生成pdf的二进制数据

        # Same as the Word export, but with the 'writer_pdf_Export' filter:
        # store into an in-memory Pipe and read the buffered bytes back.
        streamo = self.smgr.createInstanceWithContext('com.sun.star.io.Pipe', self.ctx)

        self.doco.storeToURL('private:stream', (PropertyValue('FilterName', 0, 'writer_pdf_Export', 0), PropertyValue('OutputStream', 0, streamo, 0)))

        streamo.flush()

        # readBytes returns a pair; its second item is the data.
        _, datap = streamo.readBytes(None, streamo.available())

8、读取excel二进制数据

　　def ImportFromMemory(self, data):

        istream = self.smgr.createInstanceWithContext('com.sun.star.io.SequenceInputStream', self.ctx)

        istream.initialize((uno.ByteSequence(data), ))

        pv = PropertyValue()

        pv.Name = 'InputStream'

        pv.Value = istream

        self.doc = {'doc': []}

        try:

            print("before loadComponentFromURL")

            self.document = self.desktop.loadComponentFromURL('private:stream/scalc', '_blank', 0, (pv, ))

            self.sheets = self.document.getSheets()

            print("ImportFromMemory done")

        except:

            print("ImportFromMemory failed")

            self.sheets = None

9、读取excel的文本数据

    def ExportToJson(self):
        """Parse ``self.sheets`` into ``self.doc`` and return it as a JSON string.

        The parsed sheets are appended to ``self.doc['doc']`` and the total
        length is stored under ``'length'``. If parsing raises, the result
        is reset to an empty document with length 0.
        """
        try:
            l = self.__ParseText(self.sheets, self.__Callback(self.doc['doc']))
            self.doc['length'] = l
        except Exception:
            # Narrowed from a bare except: parse failures still yield an
            # empty document, but Ctrl-C / SystemExit now propagate.
            self.doc = {'doc': [], 'length': 0}
        return json.dumps(self.doc)

    def __ParseText(self, sheets, func):
        """Parse every Spreadsheet in *sheets* and return the summed length."""
        total = 0
        sheet_iter = sheets.createEnumeration()
        while sheet_iter.hasMoreElements():
            candidate = sheet_iter.nextElement()
            if not candidate.supportsService('com.sun.star.sheet.Spreadsheet'):
                continue
            total += self.__ParseSpreadsheet(candidate, func)
        return total

    def __ParseSpreadsheet(self, spreadsheet, func):
        """Collect the TEXT cells among the visible cells of *spreadsheet*.

        EMPTY, VALUE and FORMULA cells are only logged. The record
        ``{'spreadsheet': [...], 'length': n}`` is passed to *func* and the
        summed text length is returned.
        """
        total = 0
        record = {'spreadsheet': []}
        cells = spreadsheet.queryVisibleCells().getCells().createEnumeration()
        while cells.hasMoreElements():
            cell = cells.nextElement()
            cell_type = cell.getType()
            if cell_type == self.EMPTY:
                print("cell.type==empty")
                continue
            if cell_type == self.VALUE:
                print("cell.type==VALUE", "value=", cell.getValue(), cell.getCellAddress ())
                continue
            if cell_type == self.TEXT:
                print("cell.type==TEXT","content=", cell.getString().encode("UTF-8"), cell.getCellAddress ())
                total += self.__ParseCellText(spreadsheet, cell, self.__Callback(record['spreadsheet']))
                print("__ParseCellText=", record)
                continue
            if cell_type == self.FORMULA:
                print("cell.type==FORMULA", "formula=", cell.getValue())
        record['length'] = total
        func(record)
        return total

    def __ParseCellText(self, sheet, cell, func):
        """Emit one text cell as ``{'celltext', 'x', 'y', 'sheetname', 'length'}``.

        'x' is the column and 'y' the row of the cell address. If the
        address or sheet name cannot be read, they fall back to -1 / -1 /
        ``None``. Returns the length of the cell text.
        """
        try:
            address = cell.getCellAddress()
            x = address.Column
            y = address.Row
            sheetname = sheet.getName()
        except Exception:
            # Narrowed from a bare except; the sentinel fallback is kept.
            x = -1
            y = -1
            sheetname = None
        # Fetch the string once instead of three separate calls.
        content = cell.getString()
        func({'celltext': content, 'x': x, 'y': y, 'sheetname': sheetname, 'length': len(content)})
        return len(content)

　　　　　self.EMPTY = uno.Enum("com.sun.star.table.CellContentType", "EMPTY")

        self.TEXT = uno.Enum("com.sun.star.table.CellContentType", "TEXT")

        self.FORMULA = uno.Enum("com.sun.star.table.CellContentType", "FORMULA")

        self.VALUE = uno.Enum("com.sun.star.table.CellContentType", "VALUE")

10、替换excel的文本信息

    def ImportFromJson(self, data):
        """Apply the entries under 'doc' of a JSON string to the loaded sheets.

        Errors raised while writing are reported on stdout and otherwise
        ignored (best effort). Invalid JSON still raises from ``json.loads``.
        """
        doc = json.loads(data)
        try:
            self.__WriteText(doc['doc'])
        except Exception as exc:
            # Previously `except: pass`, which hid every failure, including
            # a missing 'doc' key.
            print("ImportFromJson failed:", exc)

    def __WriteText(self, text):
        """Write every entry that has both 'paragraph' and 'sheetname' into its sheet.

        The current sheet is reused while consecutive entries name the same
        sheet. Entries whose sheet cannot be fetched are skipped.
        """
        print("__WriteText begin:", text)
        sheet = None
        for it in text:
            if not ('paragraph' in it and 'sheetname' in it):
                continue
            if sheet is None or sheet.getName() != it['sheetname']:
                try:
                    sheet = self.sheets.getByName(it['sheetname'])
                    print("getsheet:", it['sheetname'], "=", sheet.getName())
                except Exception:
                    # Narrowed from a bare except; an unknown sheet still
                    # just skips the entry.
                    sheet = None
                    continue
            self.__WriteParagraph(it, sheet)

    def __WriteParagraph(self, paragraph, sheet):
        """Replace the text of the cell at (x, y) with the entry's 'trans_sen' values.

        Nothing is written when 'length' is not positive, when the cell
        cannot be fetched, or when the entry has no 'result'. Each result
        overwrites the previous one, so the cell ends up holding the last
        'trans_sen'.
        """
        print("__WriteParagraph")
        if not paragraph['length'] > 0:
            return
        try:
            x = paragraph['x']
            y = paragraph['y']
            print("getcell:", x, y)
            cell = sheet.getCellByPosition(x, y)
            print("getcell done")
        except Exception:
            # Narrowed from a bare except; missing coordinates or an invalid
            # position still silently skip the entry.
            return
        if 'result' in paragraph:
            for it in paragraph['result']:
                print("cell=", cell.getString())
                cell.setString(it['trans_sen'])
                print("cell,", cell.getString(), ",done")

11、生成excel文档二进制数据

　　　　  streamo = self.smgr.createInstanceWithContext('com.sun.star.io.Pipe', self.ctx)

        self.document.storeToURL('private:stream', (PropertyValue('FilterName', 0, 'Calc MS Excel 2007 XML', 0), PropertyValue('OutputStream', 0, streamo, 0)))

        streamo.flush()

        _, datao = streamo.readBytes(None, streamo.available())

12、生成excel的pdf文档

        # Store the spreadsheet into an in-memory Pipe with the
        # 'calc_pdf_Export' filter, then read all buffered bytes back.
        streamo = self.smgr.createInstanceWithContext('com.sun.star.io.Pipe', self.ctx)

        self.document.storeToURL('private:stream', (PropertyValue('FilterName', 0, 'calc_pdf_Export', 0), PropertyValue('OutputStream', 0, streamo, 0)))

        streamo.flush()

        # readBytes returns a pair; its second item is the data.
        _, datap = streamo.readBytes(None, streamo.available())

libreoffice python 操作word及excel文档的更多相关文章

基于DevExpress实现对PDF、Word、Excel文档的预览及操作处理
http://www.cnblogs.com/wuhuacong/p/4175266.html 在一般的管理系统模块里面,越来越多的设计到一些常用文档的上传保存操作,其中如PDF.Word.Excel ...
Aspose.Words操作word生成PDF文档
Aspose.Words操作word生成PDF文档 using Aspose.Words; using System; using System.Collections.Generic; using ...
word ppt excel文档转换成pdf
1.把word文档转换成pdf (1).添加引用 using Microsoft.Office.Interop.Word; 添加引用 (2).转换方法 /// <summary> /// ...
C#/VB.NET: 将Word或Excel文档转化为Text
Text文件只由纯文本内容组成,且没有格式,所以其大小比Word或Excel文件更小.除此之外,Text文件还具有跨平台性,几乎与所有应用程序都兼容.因此,在某些时候,我们可能需要将Word或Exce ...
Python操作Word与Excel并打包
安装模块 # Word操作库 pip install docx # Excel操作库 pip install openpyxl # 打包exe工具 pip install pyinstaller Wo ...
php 如何写入、读取word，excel文档
如何在php写入.读取word文档 <? //如何在php写入.读取word文档 // 建立一个指向新COM组件的索引 $word = new COM("word.applicatio ...
使用NOPI读取Word、Excel文档内容
使用NOPI读取Excel的例子很多,读取Word的例子不多. Excel的解析方式有多中,可以使用ODBC查询,把Excel作为一个数据集对待.也可以使用文档结构模型的方式进行解析,即解析Workb ...
Python比较两个excel文档内容的异同
#-*- coding: utf-8 -*- #比对两个Excel文件内容的差异#---------------------假设条件----------------#1.源表和目标表格式一致#2.不存 ...
在线预览word，excel文档
Google Doc 示例:https://jsfiddle.net/7xr419yb/ Microsoft Office 示例:https://jsfiddle.net/gcuzq343/

随机推荐

如何用 Python 模糊搜索文件
一.我的文件在哪里? 1.告诉计算机文件在哪使用路径描述位置绝对路径——从根目录写到底内置模块OS 路径目录文件其他系统操作 2.描述文件的特征用条件判断来筛选 3.对比后打印文件名用 ...
Python线程同步
线程执行 join与setDaemon 子线程在主线程运行结束后,会继续执行完,如果给子线程设置为守护线程(setDaemon=True),主线程运行结束子线程即结束: 如果join()线程,那么主线 ...
20165235 2018-3 《Java程序设计》第5周学习总结
20165235 2018-3 <Java程序设计>第5周学习总结教材学习内容总结第六章内部类与异常类 (一)内部类:1.java支持在一个类中定义另一个类,这个类叫内部类.2.内部 ...
maya cmds pymel polyEvaluate 获取 bounding box
maya cmds pymel polyEvaluate 获取 bounding box cmds.polyEvaluate(bc = 1) #模型 cmds.polyEvaluate(bc2 = ...
用 threading 写多线程服务器
import socket import threading server = socket.socket() server.bind(("127.0.0.1",8899)) se ...
android studio打可执行jar包
android studio可以通过library工程打出jar包解压会看到META-INF/MANIFEST.MF文件的打开如下: Manifest-Version: 1.0 增加一行,注意冒号后 ...
Codeforces 1036E Covered Points (线段覆盖的整点数)【计算几何】
<题目链接> <转载于 >>> > 题目大意: 在二维平面上给出n条不共线的线段(线段端点是整数),问这些线段总共覆盖到了多少个整数点. 解题分析: 用GC ...
Linux安装Elasticsearch
本文介绍Linux环境如何安装Elasticsearch. 本文环境是在腾讯云服务器CentOS7.2搭建的,JDK1.8,elasticsearch-5.4.2. 1 安装JDK 网上教程很多,也可 ...
Alpha(6/10)
鐵鍋燉腯鱻项目:小鱼记账团队成员项目燃尽图冲刺情况描述站立式会议照片各成员情况团队成员学号姓名 git地址博客地址 031602240 许郁杨 (组长) https://githu ...
页面嵌入iframe那些事儿
一.用iframe如何把别人的页面嵌入自己的页面? <iframe src="http://blog.sina.com.cn/abc" frameBorder=0 scrol ...

libreoffice python 操作word及excel文档

libreoffice python 操作word及excel文档的更多相关文章

随机推荐

热门专题