java 解析office文件大全

原文地址：http://ansjsun.iteye.com/blog/791142

读取OFFICE文件纯文本

package org.css.resource.businesssoft.searchengine.quwenjiansuo;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

/**

 *

 * @author lizh

 *

 */

public class CovertFile {

    /**

     * 从word 2003文档中提取纯文本

     * @param is

     * @return

     * @throws IOException

     */

    public static String extractTextFromDOC(InputStream is) throws IOException {

        WordExtractor ex = new WordExtractor(is); // is是WORD文件的InputStream

        return ex.getText();

    }

    /**

     * 从word 2007文档中提取纯文本

     * @param fileName

     * @return

     */

    public static String extractTextFromDOC2007(String fileName) {

        try {

            OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);

            POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);

            return ex.getText();

        } catch (Exception e) {

            return "";

        }

    }

    /**

     * 从excel 2003文档中提取纯文本

     * @param is

     * @return

     * @throws IOException

     */

    private static String extractTextFromXLS(InputStream is) throws IOException {

        StringBuffer content = new StringBuffer();

        HSSFWorkbook workbook = new HSSFWorkbook(is); // 创建对Excel工作簿文件的引用

        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {

            if (null != workbook.getSheetAt(numSheets)) {

                HSSFSheet aSheet = workbook.getSheetAt(numSheets); // 获得一个sheet

                for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet

                        .getLastRowNum(); rowNumOfSheet++) {

                    if (null != aSheet.getRow(rowNumOfSheet)) {

                        HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一行

                        for (short cellNumOfRow = 0; cellNumOfRow <= aRow

                                .getLastCellNum(); cellNumOfRow++) {

                            if (null != aRow.getCell(cellNumOfRow)) {

                                HSSFCell aCell = aRow.getCell(cellNumOfRow); // 获得列值

                                if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {

                                    content.append(aCell.getNumericCellValue());

                                } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {

                                    content.append(aCell.getBooleanCellValue());

                                } else {

                                    content.append(aCell.getStringCellValue());

                                }

                            }

                        }

                    }

                }

            }

        }

        return content.toString();

    }

    /**

     * 从excel 2007文档中提取纯文本

     * @param fileName

     * @return

     * @throws Exception

     */

    private static String extractTextFromXLS2007(String fileName)

            throws Exception {

        StringBuffer content = new StringBuffer();

        // 构造 XSSFWorkbook 对象，strPath 传入文件路径

        XSSFWorkbook xwb = new XSSFWorkbook(fileName);

        // 循环工作表Sheet

        for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {

            XSSFSheet xSheet = xwb.getSheetAt(numSheet);

            if (xSheet == null) {

                continue;

            }

            // 循环行Row

            for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {

                XSSFRow xRow = xSheet.getRow(rowNum);

                if (xRow == null) {

                    continue;

                }

                // 循环列Cell

                for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {

                    XSSFCell xCell = xRow.getCell(cellNum);

                    if (xCell == null) {

                        continue;

                    }

                    if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {

                        content.append(xCell.getBooleanCellValue());

                    } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {

                        content.append(xCell.getNumericCellValue());

                    } else {

                        content.append(xCell.getStringCellValue());

                    }

                }

            }

        }

        return content.toString();

    }

    /**

     * 从excel 2007文档中提取纯文本

     * @param fileName

     * @return

     */

    public static String getXLS2007(String fileName){

        String doc = "";

        try{

            doc = extractTextFromXLS2007(fileName);

            return doc;

        }catch(Exception e){

            return "";

        }

    }

    /**

     * 从ppt 2003、2007文档中提取纯文本

     * @param fileName

     * @return

     */

    public static String getPPTX(String fileName){

        String doc = "";

        try{

            File inputFile = new File(fileName);

            POITextExtractor extractor = ExtractorFactory.createExtractor(inputFile);

            doc = extractor.getText();

            return doc;

        }catch(Exception e){

            return "";

        }

    }

    public static void main(String[] args) {

        try {

//            String wordFile = "D:/松山血战.docx";

//            String wordText2007 = CovertFile.extractTextFromDOC2007(wordFile);

//            System.out.println("wordText2007=======" + wordText2007);

//

//            InputStream is = new FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");

//            String excelText = CovertFile.extractTextFromXLS(is);

//            System.out.println("text2003==========" + excelText);

//            String excelFile = "D:/zh.xlsx";

//            String excelText2007 = CovertFile.extractTextFromXLS2007(excelFile);

//            System.out.println("excelText2007==========" + excelText2007);

            String pptFile = "D:/zz3.ppt";

            String pptx = CovertFile.getPPTX(pptFile);

            System.out.println("pptx==========" + pptx);

        } catch (Exception e) {

            e.printStackTrace();

        }

    }

}

最后突然发现其实只用两行代码就能搞定
office 2003 - office 2007

POITextExtractor extractor = ExtractorFactory.createExtractor(f);

            return extractor.getText();

于是我泪流满面....白忙乎了..顺路奉上解析pdf的吧

package com.lingjoin.extractors;

import java.io.BufferedReader;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.io.StringReader;

import java.util.Date;

import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.pdmodel.PDDocumentInformation;

import org.apache.pdfbox.util.PDFTextStripper;

import com.lingjoin.paser.LingJoinFile;

/**
 * PDF extractor: reads the text of a PDF with PDFBox and stores it on the
 * {@code LingJoinFile} being parsed.
 *
 * @author Ansj
 */
public class PDFExtractor extends AbstractExtractor {

    /** Type flag supplied at construction time and returned by {@link #getTypeFlag()}. */
    private final Integer typeFlag;

    /**
     * @param typeFlag value later returned by {@link #getTypeFlag()}
     */
    public PDFExtractor(Integer typeFlag) {
        this.typeFlag = typeFlag;
    }

    /**
     * @return the type flag passed to the constructor
     */
    public Integer getTypeFlag() {
        return typeFlag;
    }

    /**
     * Extracts the text of {@code f} and sets it on the file as a reader.
     *
     * @param f the PDF file to parse; receives the content reader
     */
    public void paserFileToReader(LingJoinFile f) throws Exception {
        BufferedReader reader = getContentReader(f);
        f.setlContentReader(reader);
    }

    /**
     * Extracts the text of {@code f} and sets it on the file as a string.
     *
     * @param f the PDF file to parse; receives the content string
     */
    public void paserFileToString(LingJoinFile f) throws Exception {
        String content = getContent(f);
        f.setlContent(content);
    }

    /**
     * Wraps the extracted text of {@code f} in a buffered reader.
     */
    private BufferedReader getContentReader(LingJoinFile f) {
        String content = getContent(f);
        return new BufferedReader(new StringReader(content));
    }

    /**
     * Loads the PDF, copies its document information onto {@code f} and returns its text.
     *
     * @param f the PDF file to read
     * @return the stripped text, or an empty string when an {@code IOException} occurs
     *         (the stack trace is printed in that case)
     */
    private String getContent(LingJoinFile f) {
        String text = "";
        PDDocument document = null;
        try {
            document = PDDocument.load(f);
            PDFTextStripper stripper = new PDFTextStripper();
            // Record the document metadata on the file before stripping the text.
            setLingJoinFileInfo(f, document.getDocumentInformation());
            text = stripper.getText(document);
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return text;
    }

    /**
     * Copies document information onto the file (project: FilePaser; created by ANSJ,
     * 2010-4-14 16:27:57).
     *
     * <p>Only the author is copied, and only when it is present; the modification date and
     * the title are not propagated.
     *
     * @param f    the file whose metadata is updated
     * @param info the document information read from the PDF
     */
    private void setLingJoinFileInfo(LingJoinFile f, PDDocumentInformation info) {
        String author = info.getAuthor();
        if (author != null) {
            f.setlAuthor(author);
        }
    }

}

java 解析office文件大全的更多相关文章

Java解析OFFICE(word,excel,powerpoint)以及PDF的实现方案及开发中的点滴分享
Java解析OFFICE(word,excel,powerpoint)以及PDF的实现方案及开发中的点滴分享在此,先分享下写此文前的经历与感受,我所有的感觉浓缩到一个字,那就是:"坑" ...
java解析xml文件并输出
使用java解析xml文件,通过dom4j,代码运行前需先导入dom4j jar包. ParseXml类代码如下: import java.io.File; import java.util.ArrayLi ...
使用Java解析XML文件或XML字符串的例子
转: 使用Java解析XML文件或XML字符串的例子 2017年09月16日 11:36:18 inter_peng 阅读数:4561 标签: JavaXML-Parserdom4j 更多个人分类: ...
Java解析JSON文件的方法
http://blog.sina.com.cn/s/blog_628cc2b70101dydc.html java读取文件的方法 http://www.cnblogs.com/lovebread/ar ...
Java解析xml文件遇到特殊符号&会出现异常的解决方案
文/朱季谦在一次Java解析xml文件的开发过程中,使用SAX解析时,出现了这样一个异常信息: Error on line 60 of document : 对实体 "xxx" ...
java解析XML文件
dom4j是一个Java的XML API,类似于jdom,用来读写XML文件的.dom4j是一个非常非常优秀的Java XML API,具有性能优异.功能强大和极其易用的特点,同时它也是一个开放源 ...
java解析properties文件
在自动化测试过程中,经常会有一些公用的属性要配置,以便后面给脚本使用,我们可以选择xml, excel或者json格式来存贮这些数据,但其实java本身就提供了properties类来处理proper ...
JAVA解析XML文件(DOM,SAX,JDOM,DOM4j附代码实现)
1.解析XML主要有四种方式 1.DOM方式解析XML(与平台无关,JAVA提供,一次性加载XML文件内容,形成树结构,不适用于大文件) 2.SAX方式解析XML(基于事件驱动,逐条解析,适用于只处理 ...
java实现office文件预览
不知觉就过了这个久了,继上篇java实现文件上传下载后,今天给大家分享一篇java实现的对office文件预览功能. 相信大家在平常的项目中会遇到需要对文件实现预览功能,这里不用下载节省很多事.大家请 ...

随机推荐

根据id来实现小程序tab切换，
本例根据绑定id来实现tab切换,但本例仍有缺陷,用for循环数据,无法实现切换.如有大神能够有更好方法,欢迎留言更正 WXML: <view class="tab"> ...
Prism 4 文档 ---第11章部署Prism应用程序
要成功移动Prism应用到生产中,需要对部署计划为应用程序的设计过程的一部分.本章介绍了注意事项和你需要采取的准备以部署应用程序,以及你要在用户手中获得部署程序所需要采取的行动. Si ...
UAC 注册表 WIN64 OS 运行时主题
首先EXE程序是32位,DelphiIDE对Project默认是启用主题的,默认情况在WIN64 OS下运行时,无管理员权限. WIN64 OS,默认情况下UAC是启用的. 上述默认情况下,EXE 是 ...
maven 构建war包时排除web.xml
在使用maven构建项目的war包时,有时并不需要src/webapp/WEB-INF/下的一些文件. 这时可以通过maven-war-plugin创建配置来排除这些文件.下面贴出我平时使用的pom. ...
使用MyEclipse开发Java EE应用：企业级应用程序项目（下）
你开学,我放价!MyEclipse线上狂欢继续!火热开启中>> [MyEclipse最新版下载] 二.项目组织.依赖性和类解析 JEE规范为企业应用程序定义了一个分层的Java类解决策略, ...
基于资源名的MVC权限控制
在程序复杂程度不断上升的过程中,无可避免需要触碰到权限控制,而权限控制又与业务逻辑紧紧相关,市场上出现了大量的权限控制产品,而程序的开发,讲究去繁化简的抽象,在我的开发过程中,逐渐发现程序的权限控制核 ...
easyui常用属性
属性分为CSS片段和JS片段. CSS类定义:1.div easyui-window 生成一个window窗口样式. 属性如下: 1)mod ...
L213
The world lost seven astronauts of Space Shuttle Columbia(哥伦比亚号航天飞机) this month. It broughthome the ...
python3：jsonpath-rw处理Json对象
前提:接口自动化测试中,存在依赖情况:test_02的某个请求参数的值,需要依赖test_01返回结果中某个字段的数据,所以就先需要拿到返回数据中特定字段的值.这里使用到python中jsonpath ...
几个你所不知道的技巧助你写出更优雅的vue.js代码
1. watch 与 computed 的巧妙结合如上图,一个简单的列表页面. 你可能会这么做: created(){ this.fetchData() }, watch: { keyword(){ ...

java 解析office文件 大全

java 解析office文件 大全的更多相关文章

随机推荐

热门专题

java 解析office文件大全

java 解析office文件大全的更多相关文章