1、引入maven依赖

        <!-- Apache PDFBox: supplies the PDDocument and PDFTextStripper classes imported by PdfParser below -->
        <dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.4</version>
</dependency>

2、相关工具类:PdfParser.java

package com.insurance.tool;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper; import com.insurance.pojo.Insurance;
import com.insurance.pojo.InsuranceOrder;
import com.insurance.pojo.InsuranceProgram;

/**
 * Extracts insurance-policy data from a PDF by pulling the text of each page
 * with PDFBox and matching it against a fixed set of regular expressions.
 *
 * <p>A page is only parsed when its text starts with the policy title
 * {@code "个 人 人 身 保 险 保 险 单"} and a policy number can be found on it;
 * every such page yields one {@link InsuranceOrder}.
 */
public class PdfParser {

    /** Title a page's text must start with to be treated as a policy page. */
    private static final String POLICY_PAGE_TITLE = "个 人 人 身 保 险 保 险 单";

    /** Path used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_SAMPLE_PATH =
            "C:\\Users\\yinz\\Desktop\\场景1\\场景1_样例_电子保单识别.pdf";

    /*
     * Line-oriented patterns accept both CRLF and LF. NOTE(review): as far as
     * I know PDFTextStripper terminates lines with the platform line separator
     * unless told otherwise, so a hard-coded CRLF only matches on Windows.
     */
    private static final String LINE_END = "\\r?\\n";

    // ---- patterns applied to the full text of a page ----
    private static final Pattern POLICY_NO = Pattern.compile("保险单号\\s(.*?)\\s");
    private static final Pattern APPLICATION_NO = Pattern.compile("投保单号\\s(.*?)\\s");
    private static final Pattern POLICY_HOLDER_LINE = Pattern.compile("投 保 人.*" + LINE_END);
    private static final Pattern INSURED_LINE = Pattern.compile("被保险人.*" + LINE_END);
    private static final Pattern INSURED_AGE = Pattern.compile("被保险人投保年龄\\s(.*?)(\r\n|\\s)");
    private static final Pattern BENEFICIARY = Pattern.compile("身故受益人及分配方式\\s(.*?)(\r\n|\\s)");
    private static final Pattern INSURANCE_NAME = Pattern.compile("险种名称及款式\\s(.*?)(\r\n|\\s)");
    private static final Pattern VALID_PERIOD = Pattern.compile("保险期间\\s(.*?)\\s合同生效日", Pattern.DOTALL);
    private static final Pattern EFFECTIVE_DATE = Pattern.compile("合同生效日\\s(.*?)(\r\n|\\s)");
    private static final Pattern CHARGE_WAY = Pattern.compile("交费方式\\s(.*?)\\s");
    private static final Pattern FEE = Pattern.compile("保 险 费\\s(.*?)(\r\n|\\s)");
    private static final Pattern POLICY_HOLDER_COUNT = Pattern.compile("投保份数\\s(.*?)(\r\n|\\s)");
    private static final Pattern PROGRAM_LIST =
            Pattern.compile("保险金额(.*?)保险责任与责任免除详见条款", Pattern.DOTALL);

    // ---- patterns applied to a single holder / insured line ----
    private static final Pattern POLICY_HOLDER_NAME = Pattern.compile("投 保 人(.*?)性别");
    private static final Pattern INSURED_NAME = Pattern.compile("被保险人(.*?)性别");
    private static final Pattern GENDER = Pattern.compile("性别(.*?)出生日期");
    private static final Pattern BIRTHDAY = Pattern.compile("出生日期(.*?)证件号码");
    private static final Pattern ID_NUMBER = Pattern.compile("证件号码(.*)$");

    /**
     * Demo entry point: parses one PDF and prints the extracted orders.
     *
     * @param args optional; {@code args[0]} is the PDF path. When absent the
     *             original hard-coded sample path is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_SAMPLE_PATH;
        readPDF(path);
    }

    /**
     * Parses every page of the PDF read from {@code stream}.
     *
     * <p>The loaded document is always closed before returning. The stream
     * itself is not closed here; it stays under the caller's control.
     *
     * @param stream PDF content
     * @return the orders found, one per recognised policy page (possibly empty)
     * @throws Exception if the PDF cannot be loaded or its text cannot be extracted
     */
    public static List<InsuranceOrder> readPDF(InputStream stream) throws Exception {
        PDDocument document = PDDocument.load(stream);
        try {
            List<InsuranceOrder> orderList = extractOrders(document);
            System.out.println(orderList);
            return orderList;
        } finally {
            // Previously leaked: the document was never closed.
            document.close();
        }
    }

    /**
     * Parses the PDF at {@code filePath} and prints the extracted orders to
     * standard output. Failures are reported on standard error instead of
     * being propagated, so this method never throws.
     *
     * @param filePath path of the PDF file to read
     */
    public static void readPDF(String filePath) {
        try {
            PDDocument document = PDDocument.load(new File(filePath));
            try {
                System.out.println(extractOrders(document));
            } finally {
                // Previously leaked: the document was never closed.
                document.close();
            }
        } catch (Exception e) {
            System.err.println("Failed to parse PDF '" + filePath + "': " + e);
        }
    }

    /** Extracts the text of each page separately and parses it into orders. */
    private static List<InsuranceOrder> extractOrders(PDDocument document) throws IOException {
        List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        int pages = document.getNumberOfPages();
        // Page by page, because parseContent decides per page whether it is a policy page.
        for (int page = 1; page <= pages; page++) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
            parseContent(stripper.getText(document), orderList);
        }
        return orderList;
    }

    /**
     * Parses the text of one page and, when it is a policy page with a policy
     * number, appends the resulting order to {@code list}.
     *
     * @param content text of a single page; {@code null} or blank is ignored
     * @param list    receives the parsed order
     */
    private static void parseContent(String content, List<InsuranceOrder> list) {
        if (content == null || content.trim().length() == 0) {
            return;
        }
        if (!content.startsWith(POLICY_PAGE_TITLE)) {
            return;
        }
        String policyNo = retriveText(content, POLICY_NO, 1);
        if (policyNo.length() == 0) {
            return;
        }

        InsuranceOrder order = new InsuranceOrder();
        order.setInsurancePoliceNo(policyNo);
        order.setInsuranceApplicationNo(retriveText(content, APPLICATION_NO, 1));
        parsePolicyHolder(content, order);
        parseInsured(content, order);
        order.setInsuredAge(retriveText(content, INSURED_AGE, 1));
        order.setBeneficiary(retriveText(content, BENEFICIARY, 1));
        order.setInsurance(parseInsurance(content));
        parsePrograms(content, order);
        list.add(order);
    }

    /** Fills name, gender, birthday and ID of the policy holder from its line. */
    private static void parsePolicyHolder(String content, InsuranceOrder order) {
        // Inner blanks are kept here because the name pattern itself contains spaces.
        String line = retriveTextWithInnnerBlank(content, POLICY_HOLDER_LINE, 0);
        order.setPoliceHolderName(retriveText(line, POLICY_HOLDER_NAME, 1));
        order.setPoliceHolderGender(retriveText(line, GENDER, 1));
        order.setPoliceHolderBirthday(retriveText(line, BIRTHDAY, 1));
        order.setPoliceHolderID(retriveText(line, ID_NUMBER, 1));
    }

    /** Fills name, gender, birthday and ID of the insured person from its line. */
    private static void parseInsured(String content, InsuranceOrder order) {
        String line = retriveTextWithInnnerBlank(content, INSURED_LINE, 0);
        order.setInsuredName(retriveText(line, INSURED_NAME, 1));
        order.setInsuredGender(retriveText(line, GENDER, 1));
        order.setInsuredBirthday(retriveText(line, BIRTHDAY, 1));
        order.setInsuredID(retriveText(line, ID_NUMBER, 1));
    }

    /** Builds the insurance (product) section of an order. */
    private static Insurance parseInsurance(String content) {
        Insurance insurance = new Insurance();
        insurance.setName(retriveText(content, INSURANCE_NAME, 1));
        // The period may wrap across lines; join the pieces.
        insurance.setValidPeriod(retriveText(content, VALID_PERIOD, 1).replaceAll(LINE_END, ""));
        insurance.setEffectiveDate(retriveText(content, EFFECTIVE_DATE, 1));
        insurance.setChargeWay(retriveText(content, CHARGE_WAY, 1));
        insurance.setFee(retriveText(content, FEE, 1));
        insurance.setPoliceHolderCount(retriveText(content, POLICY_HOLDER_COUNT, 1));
        return insurance;
    }

    /**
     * Adds one {@link InsuranceProgram} per non-blank row of the program table.
     * The first whitespace-separated token is the name, the second the fee.
     */
    private static void parsePrograms(String content, InsuranceOrder order) {
        String table = retriveTextWithInnnerBlank(content, PROGRAM_LIST, 1);
        for (String rawRow : table.split(LINE_END)) {
            String row = rawRow.trim();
            if (row.length() == 0) {
                continue;
            }
            String[] fields = row.split("\\s+");
            InsuranceProgram program = new InsuranceProgram();
            program.setName(fields[0]);
            // A row without a second token used to throw ArrayIndexOutOfBoundsException.
            program.setFee(fields.length > 1 ? fields[1] : "");
            order.getProgramList().add(program);
        }
    }

    /**
     * Returns the requested group of the first match with surrounding
     * whitespace trimmed and all space characters removed.
     *
     * @return the cleaned text, or {@code ""} when the pattern does not match
     */
    private static String retriveText(String content, Pattern p, int position) {
        return retriveTextWithInnnerBlank(content, p, position).replace(" ", "");
    }

    /**
     * Returns the requested group of the first match, trimmed but with inner
     * blanks preserved.
     *
     * @return the trimmed text, or {@code ""} when the pattern does not match
     */
    private static String retriveTextWithInnnerBlank(String content, Pattern p, int position) {
        Matcher m = p.matcher(content);
        if (m.find()) {
            return m.group(position).trim();
        }
        return "";
    }
}

相关实体类:InsuranceOrder.java

package com.insurance.pojo;

import java.util.ArrayList;
import java.util.List; public class InsuranceOrder { private String insurancePoliceNo; //保险单号
private String insuranceApplicationNo; //投保单号
private String policeHolderName; // 投保人
private String policeHolderBirthday; //投保人出生日期
private String policeHolderGender; //投保人性别
private String policeHolderID; // 投保人证件号码
private String insuredName; //被保险人
private String insuredGender; //被保险人性别
private String insuredBirthday; //被保险人出生日期
private String insuredID; //被保险人证件号
private String insuredAge; //被保险人投保年龄
private String beneficiary; //身故受益人及分配方式 private Insurance insurance; //险种
private List<InsuranceProgram> programList = new ArrayList<InsuranceProgram>(); //保险项目 public String getPoliceHolderBirthday() {
return policeHolderBirthday;
}
public void setPoliceHolderBirthday(String policeHolderBirthday) {
this.policeHolderBirthday = policeHolderBirthday;
}
public String getInsuredBirthday() {
return insuredBirthday;
}
public void setInsuredBirthday(String insuredBirthday) {
this.insuredBirthday = insuredBirthday;
}
public String getInsurancePoliceNo() {
return insurancePoliceNo;
}
public void setInsurancePoliceNo(String insurancePoliceNo) {
this.insurancePoliceNo = insurancePoliceNo;
}
public String getInsuranceApplicationNo() {
return insuranceApplicationNo;
}
public void setInsuranceApplicationNo(String insuranceApplicationNo) {
this.insuranceApplicationNo = insuranceApplicationNo;
}
public String getPoliceHolderName() {
return policeHolderName;
}
public void setPoliceHolderName(String policeHolderName) {
this.policeHolderName = policeHolderName;
}
public String getPoliceHolderGender() {
return policeHolderGender;
}
public void setPoliceHolderGender(String policeHolderGender) {
this.policeHolderGender = policeHolderGender;
}
public String getPoliceHolderID() {
return policeHolderID;
}
public void setPoliceHolderID(String policeHolderID) {
this.policeHolderID = policeHolderID;
}
public String getInsuredName() {
return insuredName;
}
public void setInsuredName(String insuredName) {
this.insuredName = insuredName;
}
public String getInsuredGender() {
return insuredGender;
}
public void setInsuredGender(String insuredGender) {
this.insuredGender = insuredGender;
}
public String getInsuredID() {
return insuredID;
}
public void setInsuredID(String insuredID) {
this.insuredID = insuredID;
}
public String getInsuredAge() {
return insuredAge;
}
public void setInsuredAge(String insuredAge) {
this.insuredAge = insuredAge;
}
public String getBeneficiary() {
return beneficiary;
}
public void setBeneficiary(String beneficiary) {
this.beneficiary = beneficiary;
}
public Insurance getInsurance() {
return insurance;
}
public void setInsurance(Insurance insurance) {
this.insurance = insurance;
}
public List<InsuranceProgram> getProgramList() {
return programList;
}
public void setProgramList(List<InsuranceProgram> programList) {
this.programList = programList;
}
@Override
public String toString() {
return "InsuranceOrder [insurancePoliceNo=" + insurancePoliceNo
+ ", insuranceApplicationNo=" + insuranceApplicationNo
+ ", policeHolderName=" + policeHolderName
+ ", policeHolderBirthday=" + policeHolderBirthday
+ ", policeHolderGender=" + policeHolderGender
+ ", policeHolderID=" + policeHolderID + ", insuredName="
+ insuredName + ", insuredGender=" + insuredGender
+ ", insuredBirthday=" + insuredBirthday + ", insuredID="
+ insuredID + ", insuredAge=" + insuredAge + ", beneficiary="
+ beneficiary + ", insurance=" + insurance + ", programList="
+ programList + "]";
} }

InsuranceProgram.java

package com.insurance.pojo;

/**
 * A single insurance program (coverage item): its name and its amount,
 * both kept as plain strings.
 *
 * @author yinz
 */
public class InsuranceProgram {

    /** Program name. */
    private String name;
    /** Amount. */
    private String fee;

    /** @return the program name, or {@code null} if none was set */
    public String getName() {
        return this.name;
    }

    /** @param name the program name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the amount, or {@code null} if none was set */
    public String getFee() {
        return this.fee;
    }

    /** @param fee the amount */
    public void setFee(String fee) {
        this.fee = fee;
    }

    /** Renders the program as {@code InsuranceProgram [name=..., fee=...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("InsuranceProgram [");
        text.append("name=").append(name);
        text.append(", fee=").append(fee);
        text.append("]");
        return text.toString();
    }
}

此处用于读取的pdf文件:http://files.cnblogs.com/files/yinz/场景1_样例_电子保单识别.rar

pdfBox 读取pdf文件的更多相关文章

  1. java 用PDFBox 删除 PDF文件中的某一页

    依赖: <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox-app ...

  2. 深入学习python解析并读取PDF文件内容的方法

    这篇文章主要学习了python解析并读取PDF文件内容的方法,包括对学习库的应用,python2.7和python3.6中python解析PDF文件内容库的更新,包括对pdfminer库的详细解释和应 ...

  3. Java 使用PDFBox提取PDF文件中的图片

    今天做PDF文件解析,遇到一个需求:提取文件中的图片并保存.使用的是流行的apache开源jar包pdfbox, 但还是遇到坑了,比如pdfbox版本太高或太低都不能用!!这个包竟然没有很好地做好兼容 ...

  4. pdf.js如何跨域读取pdf文件?

    今天,上线一个客户网站之后(使用的是广州新一代虚拟空间)发现在读取上传的pdf文件的时候读取错误,通过直接在浏览器输入文件地址的时候发现文件地址被重定向了(呵呵!),结果就是pdf文件源由本地直接变成 ...

  5. python3用pdfminer3k在线读取pdf文件

    import importlib import sys import random from urllib.request import urlopen from urllib.request imp ...

  6. java 库 pdfbox 将 pdf 文件转换成高清图片方法

    近期需要将 pdf 文件转成高清图片,使用库是 pdfbox.fontbox.可以使用 renderImageWithDPI 方法指定转换的清晰度,当然清晰度越高,转换需要的时间越长,转换出来的图片越 ...

  7. 读取pdf文件 .选择了itextsharp 库

    此库还是比较成熟.看博客园很多文章都介绍了此库 用法 如果项目用到读取pdf.  我这只是提供个思路.或者提供个方法.用itextsharp 能方便实现 StringBuilder text = ne ...

  8. 记一次为解决Python读取PDF文件的Shell操作

    目录 一.背景 二.问题 三.解决 四.一顿分析及 Shell 操作 五.后续 一.背景 本想将 PDF 文件转换为 Word 文档,然后网上搜索了一下发现有挺多转换的软件.有的是免费的.收费,咱也不 ...

  9. C# PDFBox 解析PDF文件

    下载 PDFBox-0.7.3.zip PDFBox-0.7.3.dlllucene-demos-2.0.0.dlllucene-core-2.0.0.dllbcmail-jdk14-132.dllb ...

随机推荐

  1. 【R笔记】R的内存管理和垃圾清理

    笔记: 1.R输入命令时速度不要太快,终究是个统计软件,不是编程! 2.memory.limit()查看当前操作系统分配内存给R的最大限度(单位是M?) 3.要经常 rm(object) 或者 rm( ...

  2. 上传--下载HDFS文件并指定文件物理块的大小

    使用hdfs的api接口分别实现从本地上传文件到集群和从集群下载文件到本地. 1)上传文件主要是使用FileSystem类的copyFromLocalFile()方法来实现,另外我们上传文件时可以指定 ...

  3. FTTB FTTC FTTH FTTO FSA

    FTTB Fiber to The Building 光纤到楼 FTTC Fiber to The Curb 光纤到路边 FTTH Fiber to The Home 光纤到家 FTTO Fiber ...

  4. 看懂ios命名规则

    http://liangrui.blog.51cto.com/1510945/509289/ http://daniellee520.blog.51cto.com/372529/229615

  5. 【tomcat】FileNotFoundException: C:\Program Files\Java\apache-tomcat-8.5.11-geneshop3\webapps\ROOT\index.html (拒绝访问。)

    新装系统后,tomcat启动起来 提示如下错误: Caused by: java.io.FileNotFoundException: C:\Program Files\Java\apache-tomc ...

  6. css:滑动门

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/ ...

  7. WebService authentication

    http://blog.csdn.net/largestone_187/article/details/5734632 通过SoapHeader对用户口令进行验证,只有授权的用户才可以使用接口.确保了 ...

  8. css自动换行与不换行

    1.自动换行 div{ word-wrap: break-word; word-break: normal; } 2.不换行 div{ white-space:nowrap; } 3.浮动效果不换行 ...

  9. git如何打补丁?

    git cherry-pick 可以把某个分支的某几次提交合入到当前分支,只是在一台设备上操作. git format-patch 可以把某个分支的n次提交分别打成n个补丁,然后把这些补丁文件(比如0 ...

  10. iOS :学习新技术途径和sizeClasses屏幕适配

    1.了解有什么新技术 1> 苹果API文档 - General - Guides - iOSx API Diffs 2> 观看WWDC会议视频 2.如何使用新技术 1> 自己根据AP ...