手动解析Excel获取文件元数据

工作中有遇到需要获取上传的Excel文件的列明、最大行数、大小等元数据信息。通常做法是通过Apache的POI工具加载文件然后再读取行列进行处理。这种方法很大的弊端就是需要把excel文件加载到内存，如果遇到大的文件，内存暴增，很容易出现OOM。为了解决这个问题，我研究了excel文件的格式，写了一工具类来自己解析和获取这些信息。

一、excel文件格式解析

其实xls、xlsx格式的文件其实就是一个压缩包，我们找一个excel文件，把后缀改成.rar,然后解压，你会发现文件夹里面大概是这样的：

其中关键的是xl这个文件夹，看第二张图：

1、workbook.xml 里面包含了sheet的信息，比如有几个sheet，每一个的名称是什么

2、sharedString.xml 老重要了，里面就是包含了整个excel文件中单元格中的内容，excel是通过索引来引用内容的。

3、worksheets 文件夹里面包含了sheet内容的定义

看第三张图，sheet1.xml表示第一个sheet的定义，其内容是这样的：

看到那些数字了吗，其实表示这个单元格的内容在sharedString.xml中的索引。

二、示例代码实现

接下来我将展示一个获取excel文件中列名称、行数、sheet名称的java代码。

import java.io.File;

import java.io.RandomAccessFile;

import java.io.UnsupportedEncodingException;

import java.nio.ByteBuffer;

import java.nio.CharBuffer;

import java.nio.channels.FileChannel;

import java.nio.charset.Charset;

import java.nio.charset.CharsetDecoder;

import java.util.ArrayList;

import java.util.Collections;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**

 * excel文件元数据读取工具

 *

 * @author yuananyun

 * @date 2017/11/16 14:20

 **/

public class ExcelXmlUtil {

    //获取第一个sheet的名称的表达式

    private static Pattern firstSheetPattern = Pattern.compile("sheet name=\"(.*?)\" sheetId=\"1\"");

    //抽取一行的表达式，如

    //<row r="200001" spans="1:2" s="1" customFormat="1" x14ac:dyDescent="0.15">/row>

    private static Pattern rowPattern = Pattern.compile("<row(.*?)></row>");

    //求解一行行号的表达式

    private static Pattern rowNumPattern = Pattern.compile("r=\"(\\d+)\"");

    //求解标题列个数的表达式

    private static Pattern columnCountPattern = Pattern.compile("</v>");

    //求解列标题索引的表达式

    private static Pattern columnIndexPattern = Pattern.compile("<v>(\\d*)</v>");

    //求解列标题名称的表达式

    private static Pattern titleValuePattern = Pattern.compile("(?:(?:<t>)|(?:<t xml:space=\".*\">))([\\s\\S]*?)</t>");

    static class ExcelRowColumnInfo {

        private long maxRowNum;

        private int coluntCount;

        private List<String> titleList;

        private String firstSheetName;

        public ExcelRowColumnInfo(String firstSheetName, int maxRowNum, int coluntCount, List<String> titleList) {

            this.firstSheetName = firstSheetName;

            this.maxRowNum = maxRowNum;

            this.coluntCount = coluntCount;

            this.titleList = titleList;

        }

        public long getMaxRowNum() {

            return maxRowNum;

        }

        public void setMaxRowNum(int maxRowNum) {

            this.maxRowNum = maxRowNum;

        }

        public int getColuntCount() {

            return coluntCount;

        }

        public void setColuntCount(int coluntCount) {

            this.coluntCount = coluntCount;

        }

        public List<String> getTitleList() {

            return titleList == null ? new ArrayList<>() : titleList;

        }

        public void setTitleList(List<String> titleList) {

            this.titleList = titleList;

        }

        public String getFirstSheetName() {

            return firstSheetName;

        }

        public void setFirstSheetName(String firstSheetName) {

            this.firstSheetName = firstSheetName;

        }

        @Override

        public String toString() {

            return "ExcelRowColumnInfo{" +

                    "maxRowNum=" + maxRowNum +

                    ", coluntCount=" + coluntCount +

                    ", titleList=" + titleList.toString() +

                    '}';

        }

    }

    /**

     * 获取excel文件的行列个数

     *

     * @param excelFilePath

     * @param isOverwrite   是否覆盖源excel文件

     * @return ExcelRowColumnInfo

     */

    public static ExcelRowColumnInfo getRowAndColumnInfo(String excelFilePath, boolean isOverwrite) {

        try {

            File excelFile = new File(excelFilePath);

            if (!excelFile.exists()) return null;

            String zipFilePath = excelFilePath.replace(".xlsx", ".zip").replace(".xls", ".zip");

            File zipFile = new File(zipFilePath);

            if (zipFile.exists()) zipFile.delete();

            if (isOverwrite) {

                //直接重命名

                excelFile.renameTo(zipFile);

            } else {

                // 复制文件

                FileUtil.copyFile(excelFilePath, zipFilePath);

            }

            //解压的临时目录

            String tmpDir = zipFilePath.replace(".zip", "");

            List<File> fileList = ZipUtils.upzipFile(zipFile, tmpDir);

            File sheet1File = null;

            File sharedStringsFile = null;

            File workbookFile = null;

            for (File file : fileList) {

                if (file.getPath().contains("sheet1.xml"))

                    sheet1File = file;

                if (file.getPath().contains("sharedStrings.xml"))

                    sharedStringsFile = file;

                if (file.getPath().contains("workbook.xml"))

                    workbookFile = file;

            }

            if (sheet1File == null || sharedStringsFile == null) return null;

            //抽取sheet名称

            String sheetName = parseFirstSheetName(workbookFile);

            int[] rcArray = parseMaxRowNumAndColCount(sheet1File);

            int maxRowNum = rcArray[0];

//            int columCount = rcArray[1];

            int[] titleIndexArray = parseTitleIndexArray(sheet1File);

            List<String> titleList = parseTitleList(sharedStringsFile, titleIndexArray);

            deleteFileRecursively(zipFile);

            deleteFileRecursively(new File(tmpDir));

            if (titleList == null || titleList.size() == 0 || maxRowNum == 0) return null;

            return new ExcelRowColumnInfo(sheetName, maxRowNum, titleList.size(), titleList);

        } catch (Exception ex) {

            ex.printStackTrace();

        }

        return null;

    }

    /**

     * 解析第一个sheet的名称

     *

     * @param workbookFile

     * @return

     */

    private static String parseFirstSheetName(File workbookFile) {

        String content = getFileSegment(workbookFile, 0, Integer.MAX_VALUE);

        Matcher matcher = firstSheetPattern.matcher(content);

        if (matcher.find())

            return matcher.group(1);

        return null;

    }

    /**

     * 求解标题列关键字所在的索引

     *

     * @param sheet1File

     * @return

     */

    private static int[] parseTitleIndexArray(File sheet1File) {

        int realColCount = 0;

        String startSegment = getFileSegment(sheet1File, 2000);

        if (startSegment != null) {

            //求解真实的列数

            Matcher matcher = rowPattern.matcher(startSegment);

            if (matcher.find()) {

                String firstRow = matcher.group(1);

                if (firstRow != null) {

                    matcher = columnCountPattern.matcher(firstRow);

                    while (matcher.find())

                        realColCount++;

                }

            }

            if (realColCount > 0) {

                //求解标题

                int[] titleIndexArray = new int[realColCount];

                matcher = columnIndexPattern.matcher(startSegment);

                int i = 0;

                while (matcher.find() && i < realColCount) {

                    titleIndexArray[i++] = Integer.parseInt(matcher.group(1));

                }

                return titleIndexArray;

            }

        }

        return null;

    }

    /**

     * 解析excel文件的标题列名称

     *

     * @param sharedStringsFile

     * @param titleIndexArray

     * @return

     */

    private static List<String> parseTitleList(File sharedStringsFile, int[] titleIndexArray) {

        List<String> titleList = new ArrayList<>();

        int count = titleIndexArray.length;

        if (count > 0) {

            int minIndex = Integer.MAX_VALUE;

            int maxIndex = Integer.MIN_VALUE;

            for (int i = 0; i < count; i++) {

                int index = titleIndexArray[i];

                if (index > maxIndex) maxIndex = index;

                if (index < minIndex) minIndex = index;

            }

            //885是头部的长度，限制每个row长度为200字符

//            int length = (885 + (maxIndex - minIndex + 1) * 200);

            //标题真的是到处都在，

            String[] titleArray = new String[count];

//            if (minIndex > 10000) {

//                //这是一个大文档,整篇加载

//                length = Integer.MAX_VALUE;

//            }

            String segment = getFileSegment(sharedStringsFile, 0, Integer.MAX_VALUE);

            Matcher matcher = titleValuePattern.matcher(segment);

            int i = 0;

            while (matcher.find() && count > 0) {

                String value = matcher.group(1);

//                System.out.println(i + " ------> " + value);

                for (int j = 0; j < titleIndexArray.length; j++) {

                    if (i == titleIndexArray[j]) {

                        titleArray[j] = value;

                        count--;

                        break;

                    }

                }

                i++;

            }

            if (titleArray.length > 0) {

                Collections.addAll(titleList, titleArray);

                //去掉空格单元格

                Collections.reverse(titleList);

                for (int i1 = 0; i1 < titleList.size(); i1++) {

                    String title = String.valueOf(titleList.get(i1));

                    if ("".equals(title.trim()))

                        titleList.remove(i1);

                }

                Collections.reverse(titleList);

            }

        }

        return titleList;

    }

    /**

     * 解析文件的最大行号和列数

     *

     * @param sheet1File

     * @return

     */

    private static int[] parseMaxRowNumAndColCount(File sheet1File) {

        int rowNum = 0, colCount = 0;

        String endSegment = getFileSegment(sheet1File, -1000);

        if (endSegment != null) {

            Matcher matcher = rowPattern.matcher(endSegment);

            String lastRow = "";

            while (matcher.find()) {

                lastRow = matcher.group(1);

            }

            if (lastRow.length() > 0) {

                matcher = rowNumPattern.matcher(lastRow);

                if (matcher.find())

                    rowNum = Integer.parseInt(matcher.group(1));

                matcher = columnCountPattern.matcher(lastRow);

                while (matcher.find())

                    colCount++;

            }

        }

        return new int[]{rowNum, colCount};

    }

    /**

     * 递归删除文件及文件夹

     *

     * @param file

     */

    private static void deleteFileRecursively(File file) {

        if (file.exists()) {

            if (file.isFile()) {

                file.delete();

            } else if (file.isDirectory()) {

                File[] files = file.listFiles();

                for (int i = 0; i < files.length; i++) {

                    deleteFileRecursively(files[i]);

                }

                file.delete();

            }

        }

    }

    private static String getFileSegment(File file, int length) {

        return getFileSegment(file, 0, length);

    }

    /**

     * 从一个文件中截取一段字符串

     *

     * @param file

     * @param offset

     * @param length length<0时，offset将失效

     * @return

     */

    private static String getFileSegment(File file, long offset, int length) {

        if (file == null || !file.exists()) return null;

        try {

            Charset charset = Charset.forName("UTF-8");

            CharsetDecoder decoder = charset.newDecoder();

            StringBuilder builder = new StringBuilder();

            RandomAccessFile aFile = new RandomAccessFile(file, "r");

            FileChannel inChannel = aFile.getChannel();

            if (inChannel != null) {

                if (Integer.MAX_VALUE == length)

                    length = (int) inChannel.size();

                ByteBuffer buf = ByteBuffer.allocate(Math.abs(length));

                if (length < 0)

                    offset = inChannel.size() + length;

                int size = Math.abs(length);

                inChannel.position(offset < 0 ? 0 : offset);

                int bytesRead = inChannel.read(buf);

                while (bytesRead != -1 && size > 0) {

                    buf.flip();

                    CharBuffer charBuffer = decoder.decode(buf);

                    builder.append(charBuffer);

                    buf.clear();

                    bytesRead = inChannel.read(buf);

                    size = size - bytesRead;

                }

                inChannel.close();

            }

            aFile.close();

            return builder.toString();

        } catch (Exception ex) {

            ex.printStackTrace();

        }

        return null;

    }

    /**

     * 测试

     * @param args

     * @throws UnsupportedEncodingException

     */

    public static void main(String[] args) throws UnsupportedEncodingException {

        ExcelRowColumnInfo result;

        result = getRowAndColumnInfo("D:\\元数据求解.xls", false);

        System.out.println(result);

    }

}

用到的几个工具类：

/**

     * 文件复制

     * @param srcFilePath

     * @param destFilePath

     * @return

     */

    public static String copyFile(String srcFilePath, String destFilePath){

        if (StringUtils.isEmpty(srcFilePath) || StringUtils.isEmpty(destFilePath)){

            return null;

        }

        File srcFile = new File(srcFilePath);

        File destFile = new File(destFilePath);

        if (!srcFile.exists() || srcFile.isDirectory()){

            return null;

        }

        try {

            if (!destFile.exists()) {

                destFile.createNewFile();

            }

            FileUtils.copyFile(srcFile, destFile);

            return destFilePath;

        } catch (IOException e){

            e.printStackTrace();

        }

        return null;

    }

    /**

     * 对.zip文件进行解压缩

     *

     * @param zipFile 解压缩文件

     * @param descDir 解压缩的目标地址，如：D:\\测试 或 /mnt/d/测试

     * @return

     */

    @SuppressWarnings("rawtypes")

    public static List<File> upzipFile(File zipFile, String descDir) {

        List<File> _list = new ArrayList<File>();

        try {

            ZipFile _zipFile = new ZipFile(zipFile, "GBK");

            for (Enumeration entries = _zipFile.getEntries(); entries.hasMoreElements(); ) {

                ZipEntry entry = (ZipEntry) entries.nextElement();

                File _file = new File(descDir + File.separator + entry.getName());

                if (entry.isDirectory()) {

                    _file.mkdirs();

                } else {

                    File _parent = _file.getParentFile();

                    if (!_parent.exists()) {

                        _parent.mkdirs();

                    }

                    InputStream _in = _zipFile.getInputStream(entry);

                    OutputStream _out = new FileOutputStream(_file);

                    int len = 0;

                    while ((len = _in.read(_byte)) > 0) {

                        _out.write(_byte, 0, len);

                    }

                    _in.close();

                    _out.flush();

                    _out.close();

                    _list.add(_file);

                }

            }

            _zipFile.close();

        } catch (IOException e) {

        }

        return _list;

    }

其中zip用的是

org.apache.tools.zip.ZipEntry;