前言
记录一次解决poi读取excel内存溢出问题的过程,使用poi的sax解析方式处理海量数据的excel,也包括对空单元的处理及日期格式转化。
解决过程
sax事件驱动解析与原来的解析方式最大的区别就是,sax将excel转换成xml格式然后一行去读取,而不是一下将excel解析到内存中,这样就可以有效的解决由数据量大导致的内存溢出问题了,sax解析时会忽略掉空格,这一点要单独去处理。下面用代码来说话。
1.sax将excel转换成xml
为了更好的使用sax首先看看被转化成xml是什么样子的。
public static void main(String[] args) {
String filename ="D:\\8月\\otc\\sax解析测试\\0221otcposdata20190909.xlsm";
OPCPackage pkg;
try {
pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader(pkg);
//查看转换的xml原始文件,方便理解后面解析时的处理,
InputStream in = r.getSheet("rId1");
byte[] buf = new byte[1024];
int len;
while ((len = in.read(buf)) != -1) {
System.out.write(buf, 0, len);
}
} catch (Exception e) {
e.printStackTrace();
}
}
解析后的样子,发现有好多标签,cols、col、row、 c 、v等等这些就是excel中的元素,后面的解析会用到。
2.解析excel2007
通过继承DefaultHandler类,重写process(),startElement(),characters(),endElement()这四个方法。
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFReader.SheetIterator;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
public class ExcelXlsxReader extends DefaultHandler {
private IExcelRowReader rowReader;
public void setRowReader(IExcelRowReader rowReader) {
this.rowReader = rowReader;
}
private SharedStringsTable sst;
private StylesTable stylesTable;
private String lastContents;
private int sheetIndex = -1;
private CellDataType nextDataType = CellDataType.SSTINDEX;
private final DataFormatter formatter = new DataFormatter();
private short formatIndex;
private String formatString;
private List<List<String>> templist = new ArrayList<List<String>>();
private List<String> rowList = null;
private boolean allowNullRow = true;
private String dimension;
private int longest;
private List<String> currentRow;
private boolean isSSTIndex = false;
private String sheetName = null;
/**
* 遍历工作簿中所有的电子表格
*
* @param filename
* @throws IOException
* @throws OpenXML4JException
* @throws SAXException
* @throws Exception
*/
public void process(String filename,String newFileName,String fileType) throws IOException,
OpenXML4JException, SAXException {
OPCPackage pkg = OPCPackage.open(filename);
XSSFReader r = new XSSFReader(pkg);
stylesTable = r.getStylesTable();
SharedStringsTable sst = r.getSharedStringsTable();
XMLReader parser = fetchSheetParser(sst);
SheetIterator sheets = (SheetIterator) r.getSheetsData();
while (sheets.hasNext()) {
InputStream sheet = sheets.next();
sheetName = sheets.getSheetName();
sheetIndex++;
InputSource sheetSource = new InputSource(sheet);
parser.parse(sheetSource);
sheet.close();
}
//写文件
rowReader.writeExcel(templist,sheetName,newFileName,fileType);
templist = null;
}
public XMLReader fetchSheetParser(SharedStringsTable sst)
throws SAXException {
XMLReader parser = XMLReaderFactory
.createXMLReader("org.apache.xerces.parsers.SAXParser");
this.sst = sst;
parser.setContentHandler(this);
return parser;
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
// 得到单元格内容的值
lastContents += new String(ch, start, length);
}
@Override
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
if (qName.equals("dimension")) {
dimension = attributes.getValue("ref");
longest = covertRowIdtoInt(dimension.substring(dimension
.indexOf(":") + 1));
}
//行
if (qName.equals("row")) {
currentRow = new ArrayList<String>();
}
//单元格
if (qName.equals("c")) {
// 判断单元格的值是SST的索引
if (attributes.getValue("t") != null
&& attributes.getValue("t").equals("s")) {
isSSTIndex = true;
nextDataType = CellDataType.SSTINDEX;
} else {
isSSTIndex = false;
// 当单元格的值不是SST的索引放一个空值占位。
currentRow.add("");
// 判断单元格格式类型
setNextDataType(attributes);
}
}
lastContents = "";
}
@Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
// 行结束,存储一行数据
if (qName.equals("row")) {
if (isWriteRow(currentRow)) {
rowList = new ArrayList<String>();
rowList.addAll(currentRow);
templist.add(rowList);
rowList = null;
}
}
// 取值
if (qName.equals("v")) {
// 单元格的值是SST的索引
if (isSSTIndex) {
lastContents = getDataValue(lastContents);
} else {
// 单元格的值不是SST的索引,删除占位空格,再取值
currentRow.remove(currentRow.size() - 1);
lastContents = getDataValue(lastContents);
}
currentRow.add(lastContents);
}
}
/**
* 单元格中的数据类型枚举
*/
enum CellDataType {
SSTINDEX, NUMBER, DATE, NULL
}
/**
* 在取值前处理单元格数据类型,目前只对日期格式进行处理,可扩展。
*/
public void setNextDataType(Attributes attributes) {
nextDataType = CellDataType.NUMBER;
formatIndex = -1;
formatString = null;
String dateFormat = "m/d/yy,yyyy\\-mm\\-dd;@,yyyy/m/d;@,yyyy/m/d\\ h:mm;@,mm/dd/yy;@,m/d;@,"
+ "yy/m/d;@,m/d/yy;@,[$-409]yyyy/m/d\\ h:mm\\ AM/PM;@,[$-F800]dddd\\,\\ mmmm\\ dd\\,\\ yyyy";
String cellStyleStr = attributes.getValue("s");
if (cellStyleStr != null) {
int styleIndex = Integer.parseInt(cellStyleStr);
XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
formatIndex = style.getDataFormat();
formatString = style.getDataFormatString();
// 对日期类型进行处理
if (dateFormat.contains(formatString)) {
nextDataType = CellDataType.DATE;
formatString = "yyyy-MM-dd";
}
if (formatString == null) {
nextDataType = CellDataType.NULL;
formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
}
}
}
private boolean isWriteRow(List<String> list) {
boolean flag = false;
for (int i = 0; i < list.size(); i++) {
if (list.get(i) != null && !list.get(i).equals("")) {
flag = true;
break;
}
}
if (allowNullRow) {
if (!flag) {
flag = true;// 允许一次空行
allowNullRow = false;
}
}
return flag;
}
public static int covertRowIdtoInt(String rowId) {
int firstDigit = -1;
for (int c = 0; c < rowId.length(); ++c) {
if (Character.isDigit(rowId.charAt(c))) {
firstDigit = c;
break;
}
}
String newRowId = rowId.substring(0, firstDigit);
int num = 0;
int result = 0;
int length = newRowId.length();
for (int i = 0; i < length; i++) {
char ch = newRowId.charAt(length - i - 1);
num = (int) (ch - 'A' + 1);
num *= Math.pow(26, i);
result += num;
}
return result;
}
/**
* 对解析出来的数据进行类型处理
*/
public String getDataValue(String value) {
String thisStr = null;
switch (nextDataType) {
case SSTINDEX:
String sstIndex = value.toString();
try {
int idx = Integer.parseInt(sstIndex);
XSSFRichTextString rtss = new XSSFRichTextString(sst
.getEntryAt(idx));
thisStr = rtss.toString();
rtss = null;
} catch (NumberFormatException ex) {
thisStr = value.toString();
}
break;
case DATE:
thisStr = formatter.formatRawCellContents(
Double.parseDouble(value), formatIndex, formatString);
break;
default:
thisStr = value;
break;
}
return thisStr;
}
}
3.解析excel2003
与读取excel2007有所区别,Excel2003需要通过继承HSSFListener类来解析,重写process()、processRecord()两个方法,在processRecord中进行读取操作。
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BlankRecord;
import org.apache.poi.hssf.record.BoolErrRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class ExcelXlsReader implements HSSFListener {
private int minColumns = -1;
private POIFSFileSystem fs;
private int lastRowNumber;
private int lastColumnNumber;
/** Should we output the formula, or the value it has? */
private boolean outputFormulaValues = true;
/** For parsing Formulas */
private SheetRecordCollectingListener workbookBuildingListener;
// excel2003工作薄
private HSSFWorkbook stubWorkbook;
// Records we pick up as we process
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
// 表索引
private int sheetIndex = -1;
private BoundSheetRecord[] orderedBSRs;
@SuppressWarnings("unchecked")
private ArrayList boundSheetRecords = new ArrayList();
// For handling formulas with string results
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
// 当前行
private int curRow = 0;
// 存储行记录的容器
private List<String> rowlist = new ArrayList<String>();;
private boolean allowNullRow = true;
@SuppressWarnings("unused")
private String sheetName;
private List<List<String>> templist = new ArrayList<List<String>>();
private List<String> r = null;
private IExcelRowReader rowReader;
public void setRowReader(IExcelRowReader rowReader) {
this.rowReader = rowReader;
}
/**
* 遍历excel下所有的sheet
*
* @throws IOException
*/
public void process(String fileName,String newFileName,String fileType) throws IOException {
this.fs = new POIFSFileSystem(new FileInputStream(fileName));
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
if (outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new SheetRecordCollectingListener(formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
factory.processWorkbookEvents(request, fs);
rowReader.writeExcel(templist,sheetName,newFileName,fileType);
templist = null;
}
/**
* HSSFListener 监听方法,处理 Record
*/
@SuppressWarnings("unchecked")
public void processRecord(Record record) {
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
String value = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
boundSheetRecords.add(record);
break;
case BOFRecord.sid:
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
// 如果有需要,则建立子工作薄
if (workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
}
sheetIndex++;
if (orderedBSRs == null) {
orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
}
sheetName = orderedBSRs[sheetIndex].getSheetname();
//ExcelRowReader.SHEETNAME = sheetName;
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid:
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
thisStr = "";
rowlist.add(thisColumn, thisStr);
break;
case BoolErrRecord.sid: // 单元格为布尔类型
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = berec.getBooleanValue() + "";
rowlist.add(thisColumn, thisStr);
break;
case FormulaRecord.sid: // 单元格为公式类型
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = formatListener.formatNumberDateCell(frec);
}
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
rowlist.add(thisColumn, thisStr);
break;
case StringRecord.sid:// 单元格中公式的字符串
if (outputNextStringRecord) {
StringRecord srec = (StringRecord) record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
curRow = thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
value = lrec.getValue().trim();
value = value.equals("") ? " " : value;
this.rowlist.add(thisColumn, value);
break;
case LabelSSTRecord.sid: // 单元格为字符串类型
LabelSSTRecord lsrec = (LabelSSTRecord) record;
curRow = thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (sstRecord == null) {
rowlist.add(thisColumn, " ");
} else {
value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();
value = value.equals("") ? " " : value;
rowlist.add(thisColumn, value);
}
break;
case NumberRecord.sid: // 单元格为数字类型
NumberRecord numrec = (NumberRecord) record;
curRow = thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
value = formatListener.formatNumberDateCell(numrec).trim();
value = value.equals("") ? " " : value;
// 向容器加入列值
rowlist.add(thisColumn, value);
break;
default:
break;
}
// 遇到新行的操作
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
// 空值的操作
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
curRow = thisRow = mc.getRow();
thisColumn = mc.getColumn();
rowlist.add(thisColumn, " ");
}
// 更新行和列的值
if (thisRow > -1)
lastRowNumber = thisRow;
if (thisColumn > -1)
lastColumnNumber = thisColumn;
// 行结束时的操作
if (record instanceof LastCellOfRowDummyRecord) {
if (minColumns > 0) {
// 列值重新置空
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
}
lastColumnNumber = -1;
// 每行结束时, 调用getRows() 方法
if(isWriteRow(rowlist)){
//rowReader.getRows(sheetIndex, curRow, rowlist);
r = new ArrayList<String>();
r.addAll(rowlist);
templist.add(r);
r = null;
}
// 清空容器
rowlist.clear();
}
}
private boolean isWriteRow(List list){
boolean flag = false;
for (int i = 0; i < list.size(); i++) {
if (list.get(i)!= null && !list.get(i).equals("")) {
flag = true;
break;
}
}
if (allowNullRow) {
if (!flag) {
flag = true;//允许一次空行
allowNullRow = false;
}
}
return flag;
}
}