1、引入maven依赖

        <dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.4</version>
</dependency>

2、相关工具类:PdfParser.java

package com.insurance.tool;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper; import com.insurance.pojo.Insurance;
import com.insurance.pojo.InsuranceOrder;
import com.insurance.pojo.InsuranceProgram;

/**
 * Extracts insurance policy data from the text layer of a PDF using PDFBox.
 *
 * <p>Each page is converted to text separately. A page is treated as a policy page only when its
 * text starts with the policy title and a policy number can be found on it; every such page
 * yields one {@link InsuranceOrder}.
 *
 * <p>All line-based patterns accept both CRLF and LF line endings, because the line separator in
 * the extracted text is not guaranteed to be CRLF (NOTE(review): PDFTextStripper appears to
 * default to the platform line separator - confirm against the PDFBox version in use).
 */
public class PdfParser {

    /** Sample file used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_SAMPLE_PATH =
            "C:\\Users\\yinz\\Desktop\\场景1\\场景1_样例_电子保单识别.pdf";

    /** Text a page must start with to be parsed as a policy page. */
    private static final String POLICY_PAGE_TITLE = "个 人 人 身 保 险 保 险 单";

    /** Regex for one line break, CRLF or LF. */
    private static final String LINE_BREAK = "\\r?\\n";

    // Page-level patterns. Group 1 is the value unless the whole line (group 0) is needed.
    private static final Pattern INSURANCE_POLICE_NO_P = Pattern.compile("保险单号\\s(.*?)\\s");
    private static final Pattern INSURANCE_APPLICATION_NO_P = Pattern.compile("投保单号\\s(.*?)\\s");
    private static final Pattern POLICE_HOLDER_P = Pattern.compile("投 保 人.*" + LINE_BREAK);
    private static final Pattern INSURED_P = Pattern.compile("被保险人.*" + LINE_BREAK);
    private static final Pattern INSURED_AGE_P =
            Pattern.compile("被保险人投保年龄\\s(.*?)(" + LINE_BREAK + "|\\s)");
    private static final Pattern BENEFICIARY_P =
            Pattern.compile("身故受益人及分配方式\\s(.*?)(" + LINE_BREAK + "|\\s)");
    private static final Pattern INSURANCE_NAME_P =
            Pattern.compile("险种名称及款式\\s(.*?)(" + LINE_BREAK + "|\\s)");
    private static final Pattern VALID_PERIOD_P =
            Pattern.compile("保险期间\\s(.*?)\\s合同生效日", Pattern.DOTALL);
    private static final Pattern EFFECTIVE_DATE_P =
            Pattern.compile("合同生效日\\s(.*?)(" + LINE_BREAK + "|\\s)");
    private static final Pattern CHARGE_WAY_P = Pattern.compile("交费方式\\s(.*?)\\s");
    private static final Pattern FEE_P = Pattern.compile("保 险 费\\s(.*?)(" + LINE_BREAK + "|\\s)");
    private static final Pattern POLICE_HOLDER_COUNT_P =
            Pattern.compile("投保份数\\s(.*?)(" + LINE_BREAK + "|\\s)");
    private static final Pattern PROGRAM_LIST_P =
            Pattern.compile("保险金额(.*?)保险责任与责任免除详见条款", Pattern.DOTALL);

    // Patterns applied to a single "person" line (policyholder or insured).
    private static final Pattern POLICE_HOLDER_NAME_P = Pattern.compile("投 保 人(.*?)性别");
    private static final Pattern INSURED_NAME_P = Pattern.compile("被保险人(.*?)性别");
    private static final Pattern GENDER_P = Pattern.compile("性别(.*?)出生日期");
    private static final Pattern BIRTHDAY_P = Pattern.compile("出生日期(.*?)证件号码");
    private static final Pattern ID_P = Pattern.compile("证件号码(.*)$");

    /**
     * Demo entry point: parses the PDF given as the first argument, or the built-in sample path
     * when no argument is supplied, and prints the result.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to parse
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_SAMPLE_PATH;
        readPDF(path);
    }

    /**
     * Parses every page of the PDF read from {@code stream}, prints the parsed orders to standard
     * output and returns them.
     *
     * <p>The loaded document is closed before this method returns; the stream itself is not
     * closed here and remains the caller's responsibility.
     *
     * @param stream source of the PDF bytes
     * @return the orders found, one per recognised policy page; never {@code null}
     * @throws Exception if the PDF cannot be loaded or its text cannot be extracted
     */
    public static List<InsuranceOrder> readPDF(InputStream stream) throws Exception {
        List<InsuranceOrder> orderList = extractOrders(PDDocument.load(stream));
        System.out.println(orderList);
        return orderList;
    }

    /**
     * Parses every page of the PDF at {@code filePath} and prints the parsed orders to standard
     * output. Any failure while loading or parsing is printed to standard output instead of
     * being propagated.
     *
     * @param filePath path of the PDF file to read
     */
    public static void readPDF(String filePath) {
        File pdfFile = new File(filePath);
        try {
            List<InsuranceOrder> orderList = extractOrders(PDDocument.load(pdfFile));
            System.out.println(orderList);
        } catch (Exception e) {
            // Best-effort demo method: report the failure and carry on.
            System.out.println(e);
        }
    }

    /**
     * Extracts the text of each page in turn, feeds it to the page parser, and always closes the
     * document afterwards so its resources are released even when extraction fails.
     */
    private static List<InsuranceOrder> extractOrders(PDDocument document) throws IOException {
        try {
            List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text in visual (position) order rather than content-stream order.
            stripper.setSortByPosition(true);
            int pages = document.getNumberOfPages();
            // Pages are 1-based; restrict the stripper to one page per call.
            for (int page = 1; page <= pages; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                parseContent(stripper.getText(document), orderList);
            }
            return orderList;
        } finally {
            document.close();
        }
    }

    /**
     * Parses the text of one page and, if it is a policy page with a policy number, appends the
     * resulting order to {@code list}. Pages that are blank, do not start with the policy title,
     * or carry no policy number are ignored.
     */
    private static void parseContent(String content, List<InsuranceOrder> list) {
        if (content == null || content.trim().length() == 0) {
            return;
        }
        if (!content.startsWith(POLICY_PAGE_TITLE)) {
            return;
        }
        String insurancePoliceNo = retriveText(content, INSURANCE_POLICE_NO_P, 1);
        if (insurancePoliceNo.length() == 0) {
            return;
        }

        // Personal information.
        InsuranceOrder order = new InsuranceOrder();
        list.add(order);
        order.setInsurancePoliceNo(insurancePoliceNo);
        order.setInsuranceApplicationNo(retriveText(content, INSURANCE_APPLICATION_NO_P, 1));
        fillPoliceHolder(order, content);
        fillInsured(order, content);
        order.setInsuredAge(retriveText(content, INSURED_AGE_P, 1));
        order.setBeneficiary(retriveText(content, BENEFICIARY_P, 1));

        // Insurance product information.
        order.setInsurance(buildInsurance(content));

        // Insurance program (coverage item) rows.
        fillPrograms(order, content);
    }

    /** Fills the policyholder fields from the policyholder line; missing parts become "". */
    private static void fillPoliceHolder(InsuranceOrder order, String content) {
        String line = retriveTextWithInnnerBlank(content, POLICE_HOLDER_P, 0);
        order.setPoliceHolderName(retriveText(line, POLICE_HOLDER_NAME_P, 1));
        order.setPoliceHolderGender(retriveText(line, GENDER_P, 1));
        order.setPoliceHolderBirthday(retriveText(line, BIRTHDAY_P, 1));
        order.setPoliceHolderID(retriveText(line, ID_P, 1));
    }

    /** Fills the insured-person fields from the insured line; missing parts become "". */
    private static void fillInsured(InsuranceOrder order, String content) {
        String line = retriveTextWithInnnerBlank(content, INSURED_P, 0);
        order.setInsuredName(retriveText(line, INSURED_NAME_P, 1));
        order.setInsuredGender(retriveText(line, GENDER_P, 1));
        order.setInsuredBirthday(retriveText(line, BIRTHDAY_P, 1));
        order.setInsuredID(retriveText(line, ID_P, 1));
    }

    /** Builds the insurance product description from the page text. */
    private static Insurance buildInsurance(String content) {
        Insurance insurance = new Insurance();
        insurance.setName(retriveText(content, INSURANCE_NAME_P, 1));
        // The period may wrap over several lines; join them by dropping the line breaks.
        insurance.setValidPeriod(retriveText(content, VALID_PERIOD_P, 1).replaceAll("[\\r\\n]", ""));
        insurance.setEffectiveDate(retriveText(content, EFFECTIVE_DATE_P, 1));
        insurance.setChargeWay(retriveText(content, CHARGE_WAY_P, 1));
        insurance.setFee(retriveText(content, FEE_P, 1));
        insurance.setPoliceHolderCount(retriveText(content, POLICE_HOLDER_COUNT_P, 1));
        return insurance;
    }

    /**
     * Adds one program per non-blank row of the program table. The first whitespace-separated
     * column is the name and the second the fee; a row with a single column keeps its name and
     * leaves the fee unset instead of failing.
     */
    private static void fillPrograms(InsuranceOrder order, String content) {
        String programList = retriveTextWithInnnerBlank(content, PROGRAM_LIST_P, 1);
        for (String row : programList.split(LINE_BREAK)) {
            String trimmed = row.trim();
            if (trimmed.length() == 0) {
                continue;
            }
            String[] columns = trimmed.split("\\s+");
            InsuranceProgram program = new InsuranceProgram();
            program.setName(columns[0]);
            if (columns.length > 1) {
                program.setFee(columns[1]);
            }
            order.getProgramList().add(program);
        }
    }

    /**
     * Returns the requested group of the first match of {@code p} in {@code content}, trimmed and
     * with all space characters removed, or "" when there is no match.
     */
    private static String retriveText(String content, Pattern p, int position) {
        Matcher m = p.matcher(content);
        if (m.find()) {
            return m.group(position).trim().replace(" ", "");
        }
        return "";
    }

    /**
     * Returns the requested group of the first match of {@code p} in {@code content}, trimmed but
     * with inner blanks preserved, or "" when there is no match.
     */
    private static String retriveTextWithInnnerBlank(String content, Pattern p, int position) {
        Matcher m = p.matcher(content);
        if (m.find()) {
            return m.group(position).trim();
        }
        return "";
    }
}

相关实体类:InsuranceOrder.java

package com.insurance.pojo;

import java.util.ArrayList;
import java.util.List; public class InsuranceOrder { private String insurancePoliceNo; //保险单号
private String insuranceApplicationNo; //投保单号
private String policeHolderName; // 投保人
private String policeHolderBirthday; //投保人出生日期
private String policeHolderGender; //投保人性别
private String policeHolderID; // 投保人证件号码
private String insuredName; //被保险人
private String insuredGender; //被保险人性别
private String insuredBirthday; //被保险人出生日期
private String insuredID; //被保险人证件号
private String insuredAge; //被保险人投保年龄
private String beneficiary; //身故受益人及分配方式 private Insurance insurance; //险种
private List<InsuranceProgram> programList = new ArrayList<InsuranceProgram>(); //保险项目 public String getPoliceHolderBirthday() {
return policeHolderBirthday;
}
public void setPoliceHolderBirthday(String policeHolderBirthday) {
this.policeHolderBirthday = policeHolderBirthday;
}
public String getInsuredBirthday() {
return insuredBirthday;
}
public void setInsuredBirthday(String insuredBirthday) {
this.insuredBirthday = insuredBirthday;
}
public String getInsurancePoliceNo() {
return insurancePoliceNo;
}
public void setInsurancePoliceNo(String insurancePoliceNo) {
this.insurancePoliceNo = insurancePoliceNo;
}
public String getInsuranceApplicationNo() {
return insuranceApplicationNo;
}
public void setInsuranceApplicationNo(String insuranceApplicationNo) {
this.insuranceApplicationNo = insuranceApplicationNo;
}
public String getPoliceHolderName() {
return policeHolderName;
}
public void setPoliceHolderName(String policeHolderName) {
this.policeHolderName = policeHolderName;
}
public String getPoliceHolderGender() {
return policeHolderGender;
}
public void setPoliceHolderGender(String policeHolderGender) {
this.policeHolderGender = policeHolderGender;
}
public String getPoliceHolderID() {
return policeHolderID;
}
public void setPoliceHolderID(String policeHolderID) {
this.policeHolderID = policeHolderID;
}
public String getInsuredName() {
return insuredName;
}
public void setInsuredName(String insuredName) {
this.insuredName = insuredName;
}
public String getInsuredGender() {
return insuredGender;
}
public void setInsuredGender(String insuredGender) {
this.insuredGender = insuredGender;
}
public String getInsuredID() {
return insuredID;
}
public void setInsuredID(String insuredID) {
this.insuredID = insuredID;
}
public String getInsuredAge() {
return insuredAge;
}
public void setInsuredAge(String insuredAge) {
this.insuredAge = insuredAge;
}
public String getBeneficiary() {
return beneficiary;
}
public void setBeneficiary(String beneficiary) {
this.beneficiary = beneficiary;
}
public Insurance getInsurance() {
return insurance;
}
public void setInsurance(Insurance insurance) {
this.insurance = insurance;
}
public List<InsuranceProgram> getProgramList() {
return programList;
}
public void setProgramList(List<InsuranceProgram> programList) {
this.programList = programList;
}
@Override
public String toString() {
return "InsuranceOrder [insurancePoliceNo=" + insurancePoliceNo
+ ", insuranceApplicationNo=" + insuranceApplicationNo
+ ", policeHolderName=" + policeHolderName
+ ", policeHolderBirthday=" + policeHolderBirthday
+ ", policeHolderGender=" + policeHolderGender
+ ", policeHolderID=" + policeHolderID + ", insuredName="
+ insuredName + ", insuredGender=" + insuredGender
+ ", insuredBirthday=" + insuredBirthday + ", insuredID="
+ insuredID + ", insuredAge=" + insuredAge + ", beneficiary="
+ beneficiary + ", insurance=" + insurance + ", programList="
+ programList + "]";
} }

InsuranceProgram.java

package com.insurance.pojo;

/**
 * Insurance program: a named item together with its amount, both kept as text.
 *
 * @author yinz
 */
public class InsuranceProgram {

    /** Program name. */
    private String name;

    /** Amount. */
    private String fee;

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getFee() {
        return this.fee;
    }

    public void setFee(String fee) {
        this.fee = fee;
    }

    /** Renders the program as {@code InsuranceProgram [name=..., fee=...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("InsuranceProgram [");
        text.append("name=").append(name);
        text.append(", fee=").append(fee);
        text.append("]");
        return text.toString();
    }
}

此处用于读取的pdf文件:http://files.cnblogs.com/files/yinz/场景1_样例_电子保单识别.rar

pdfBox 读取pdf文件的更多相关文章

  1. java 用PDFBox 删除 PDF文件中的某一页

    依赖: <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox-app ...

  2. 深入学习python解析并读取PDF文件内容的方法

    这篇文章主要学习了python解析并读取PDF文件内容的方法,包括对学习库的应用,python2.7和python3.6中python解析PDF文件内容库的更新,包括对pdfminer库的详细解释和应 ...

  3. Java 使用PDFBox提取PDF文件中的图片

    今天做PDF文件解析,遇到一个需求:提取文件中的图片并保存.使用的是流行的apache开源jar包pdfbox, 但还是遇到坑了,比如pdfbox版本太高或太低都不能用!!这个包竟然没有很好地做好兼容 ...

  4. pdf.js如何跨域读取pdf文件?

    今天,上线一个客户网站之后(使用的是广州新一代虚拟空间)发现在读取上传的pdf文件的时候读取错误,通过直接在浏览器输入文件地址的时候发现文件地址被重定向了(呵呵!),结果就是pdf文件源由本地直接变成 ...

  5. python3用pdfminer3k在线读取pdf文件

    import importlib import sys import random from urllib.request import urlopen from urllib.request imp ...

  6. java 库 pdfbox 将 pdf 文件转换成高清图片方法

    近期需要将 pdf 文件转成高清图片,使用库是 pdfbox.fontbox.可以使用 renderImageWithDPI 方法指定转换的清晰度,当然清晰度越高,转换需要的时间越长,转换出来的图片越 ...

  7. 读取pdf文件 .选择了itextsharp 库

    此库还是比较成熟.看博客园很多文章都介绍了此库 用法 如果项目用到读取pdf.  我这只是提供个思路.或者提供个方法.用itextsharp 能方便实现 StringBuilder text = ne ...

  8. 记一次为解决Python读取PDF文件的Shell操作

    目录 一.背景 二.问题 三.解决 四.一顿分析及 Shell 操作 五.后续 一.背景 本想将 PDF 文件转换为 Word 文档,然后网上搜索了一下发现有挺多转换的软件.有的是免费的.收费,咱也不 ...

  9. C# PDFBox 解析PDF文件

    下载 PDFBox-0.7.3.zip PDFBox-0.7.3.dlllucene-demos-2.0.0.dlllucene-core-2.0.0.dllbcmail-jdk14-132.dllb ...

随机推荐

  1. Scala实战高手****第2课:Scala零基础实战入门的第一堂课及如何成为Scala高手

    val声明的不可变的战略意义:1.函数式编程中要求值不可变,val天然符合这一特性:2.在分布式系统中,一般都要求值不可变,这样才能够要求分布式系统的设计和实现,同时拥有更高的效率,val声明的内容都 ...

  2. Activity组件(传递数据)

    (一) 1.效果图:点击按钮“调用第二个Activity”,转到第二页面,之后点击“返回数据”,将第二个页面的数据传到第一个页面         2. activity_main.xml <?x ...

  3. Mac sublime 编译Python UnicodeEncodeError: 'ascii' codec can't encode characters in position 6-8: ordinal not in range(128)

    刚学Python,想打印个“hello 张林峰”,代码如下: #!/usr/bin/env python3 # -*- coding: utf-8 -*- print('hello 张林峰') 用su ...

  4. Java使用POM一JAR包的形式管理JavaScript文件-WebJars

    说明:原来JS框架还可以使用POM进行管理的.WebJars是一个很神奇的东西,可以让大家以JAR包的形式来使用前端的各种框架.组件. 什么是WebJars 什么是WebJars?WebJars是将客 ...

  5. MathType插入空格

    公式太长,换行后加一些空格,继续录. 将鼠标定位到需要插入空格的位置,此时如果直接按空格键,你会发现并不能插入空格.正确的输入方法有两种: 方法一,在菜单栏中[样式]菜单下选择[文本],随后按空格键即 ...

  6. Docker实践3: Docker常用命令(未完)

    查看容器及运行进程 docker ps 查看容器内部信息 docker inspect container_id 进入容器 docker attach container_id 退出容器 docker ...

  7. PostgreSQL配置文件--资源使用(除WAL外)

    2 资源使用(除WAL外) RESOURCE USAGE (except for WAL) 2.1 内存 Memory 2.1.1 shared_buffers 数字型 默认: shared_buff ...

  8. PHP5实现foreach语言结构遍历一个类的实例

    PHP5实现foreach语言结构遍历一个类 创建一个类集成Iterator接口,并实现Iterator里面的方法即可,下面见实例代码实现 <?php class Test implements ...

  9. mysql 5.7.13 安装配置方法(linux)-后期部分运维

    mysql 5.7.13 安装配置方法图文教程(linux) 学习了:https://www.cnblogs.com/zhao1949/p/5947938.html /usr/local/mysql是 ...

  10. Java实现算法之--选择排序

    选择排序也是比較简单的一种排序方法,原理也比較easy理解,它与冒泡排序的比較次数同样,但选择排序的交换次数少于冒泡排序.冒泡排序是在每次比較之后,若比較的两个元素顺序与待排序顺序相反,则要进行交换, ...