目录
- 一、获取doc、docx、xls、xlsx、ppt、pptx中文本
- 1、统一方法
- 1.1、pom(支持doc、docx、xls、xlsx、ppt、pptx)
- 1.2、代码
- 1.3、结果
- 2、分散方法
- 2.1、pom
- 2.2、代码
- 2.3、结果
- 二、获取html中文本
- 1、pom
- 2、代码
- 3、结果
- 三、问题解决
- 1、java.io.IOException: Zip bomb detected!
- (1)背景
- (2)错误
- (3)解决办法
- (4)参考资料
一、获取doc、docx、xls、xlsx、ppt、pptx中文本
1、统一方法
1.1、pom(支持doc、docx、xls、xlsx、ppt、pptx)
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.4</version>
</dependency>
1.2、代码
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * Demo of the "unified" approach: a single POI ExtractorFactory call is used
 * for every supported Office format.
 */
public class Test {
    public static void main(String[] args) {
        System.out.println(toText(new File("C:\\test\\20221115\\555.pptx")));
    }

    /**
     * Extracts the text of an Office file (doc, docx, xls, xlsx, ppt, pptx
     * according to the accompanying notes) through
     * {@code ExtractorFactory.createExtractor}.
     *
     * @param file the file to read
     * @return the extracted text, or {@code null} when the file cannot be
     *         opened or parsed (the stack trace is printed in that case)
     */
    public static String toText(File file) {
        // try-with-resources closes the stream on every path
        try (FileInputStream input = new FileInputStream(file)) {
            POITextExtractor extractor = ExtractorFactory.createExtractor(input);
            return extractor.getText();
        } catch (Exception e) {
            // Previously swallowed silently; report it so a null result can be diagnosed
            e.printStackTrace();
        }
        return null;
    }
}
1.3、结果
2、分散方法
2.1、pom
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.1</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.9</version>
</dependency>
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.3</version>
</dependency>
2.2、代码
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.dom4j.Document;
import org.dom4j.io.SAXReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
* 获取文件中的文本
*
* @author 明快de玄米61
* @date 2022/12/8 23:50
*/
public class Test {
/**
 * Runs every extractor once against a sample file and prints what it returns.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // .doc
    String docText = extractDoc("C:\\test\\20221115\\222.doc");
    System.out.println(docText);

    // .docx
    String docxText = extractDocx("C:\\test\\20221115\\111.docx");
    System.out.println(docxText);

    // .xls
    String xlsText = extractXls("C:\\test\\20221115\\444.xls");
    System.out.println(xlsText);

    // .xlsx
    String xlsxText = extractXlsx("C:\\test\\20221115\\333.xlsx");
    System.out.println(xlsxText);

    // .ppt
    String pptText = extractPpt("C:\\test\\20221115\\555.ppt");
    System.out.println(pptText);

    // .pptx
    String pptxText = extractPptx("C:\\test\\20221115\\555.pptx");
    System.out.println(pptxText);

    // .pdf
    String pdfText = extractPdf("C:\\test\\20221115\\888.pdf");
    System.out.println(pdfText);

    // .xml
    String xmlText = extractXml("C:\\test\\20221115\\1000.xml");
    System.out.println(xmlText);
}
/**
 * Reads the text of a .doc file with {@code WordExtractor}.
 *
 * @param filePath full path of the file
 * @return the extracted text, or {@code null} if opening or parsing fails
 *         (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractDoc(String filePath) {
    try (InputStream docStream = new FileInputStream(filePath)) {
        WordExtractor wordExtractor = new WordExtractor(docStream);
        String text = wordExtractor.getText();
        return text;
    } catch (Exception failure) {
        failure.printStackTrace();
        return null;
    }
}
/**
 * Reads the text of a .docx file with {@code XWPFWordExtractor}.
 *
 * @param filePath full path of the file
 * @return the extracted text, or {@code null} if opening or parsing fails
 *         (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractDocx(String filePath) {
    try (InputStream docxStream = new FileInputStream(filePath)) {
        XWPFWordExtractor wordExtractor = new XWPFWordExtractor(new XWPFDocument(docxStream));
        String text = wordExtractor.getText();
        return text;
    } catch (Exception failure) {
        failure.printStackTrace();
        return null;
    }
}
/**
 * Reads the text of an .xls file by loading it as an {@code HSSFWorkbook}
 * and flattening it with {@code workbook2String}.
 *
 * @param filePath full path of the file
 * @return the flattened workbook text, or {@code null} if opening or parsing
 *         fails (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractXls(String filePath) {
    try (InputStream xlsStream = new FileInputStream(filePath)) {
        Workbook book = new HSSFWorkbook(xlsStream);
        String text = workbook2String(book);
        return text;
    } catch (Exception failure) {
        failure.printStackTrace();
        return null;
    }
}
/**
 * Reads the text of an .xlsx file by loading it as an {@code XSSFWorkbook}
 * and flattening it with {@code workbook2String}.
 *
 * @param filePath full path of the file
 * @return the flattened workbook text, or {@code null} if opening or parsing
 *         fails (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractXlsx(String filePath) {
    try (InputStream xlsxStream = new FileInputStream(filePath)) {
        Workbook book = new XSSFWorkbook(xlsxStream);
        String text = workbook2String(book);
        return text;
    } catch (Exception failure) {
        failure.printStackTrace();
        return null;
    }
}
/**
 * Flattens every sheet of a workbook into one string.
 * <p>
 * For each sheet the sheet name plus a newline is written first. Then every
 * row from the first to the last row number (inclusive) is visited, and each
 * non-empty cell value is written followed by a comma; rows are not separated
 * from each other. A newline ends the sheet.
 *
 * @param workbook the workbook to dump; may be {@code null}
 * @return the collected text; an empty string when {@code workbook} is null
 */
private static String workbook2String(Workbook workbook) {
    StringBuilder text = new StringBuilder();
    if (workbook == null) {
        return text.toString();
    }
    for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
        Sheet sheet = workbook.getSheetAt(sheetIndex);
        if (sheet == null) {
            continue;
        }
        text.append(sheet.getSheetName()).append("\n");

        int lastRow = sheet.getLastRowNum();
        for (int rowIndex = sheet.getFirstRowNum(); rowIndex <= lastRow; rowIndex++) {
            Row row = sheet.getRow(rowIndex);
            // Skip rows that were never created or hold no cells
            boolean emptyRow = row == null || row.getPhysicalNumberOfCells() == 0;
            if (emptyRow) {
                continue;
            }
            // getLastCellNum() is used as an exclusive bound
            int endCell = row.getLastCellNum();
            for (int cellIndex = row.getFirstCellNum(); cellIndex < endCell; cellIndex++) {
                String value = cellValue(row.getCell(cellIndex));
                if ("".equals(value)) {
                    continue;
                }
                text.append(value).append(",");
            }
        }
        text.append("\n");
    }
    return text.toString();
}
/**
 * Converts one Excel cell to the text used in the extracted output.
 * <ul>
 *   <li>numeric cell whose data-format index is in the date list: formatted as
 *       {@code yyyy/MM/dd} (index 14) or {@code yyyy年MM月dd日} (all others);</li>
 *   <li>other numeric cell: read back as a string so that {@code 1} is not
 *       rendered as {@code 1.0}; values containing a dot are normalised
 *       through {@code double};</li>
 *   <li>string / boolean cell: its value; formula cell: the formula text;</li>
 *   <li>blank, error, unknown or {@code null} cell: empty string.</li>
 * </ul>
 *
 * @param cell the cell to read; may be {@code null}
 * @return the cell text, never {@code null}
 */
private static String cellValue(Cell cell) {
    if (cell == null) {
        return "";
    }
    switch (cell.getCellType()) {
        case Cell.CELL_TYPE_NUMERIC: {
            short format = cell.getCellStyle().getDataFormat();
            if (isDateFormat(format)) {
                Date date = org.apache.poi.ss.usermodel.DateUtil
                        .getJavaDate(cell.getNumericCellValue());
                // The value may not convert to a date; previously this case fell
                // through to format(null) and only "worked" via a caught exception.
                if (date == null) {
                    return "";
                }
                String pattern = (format == 14) ? "yyyy/MM/dd" : "yyyy年MM月dd日";
                return new SimpleDateFormat(pattern).format(date);
            }
            // Not a date: switch the cell to string type to read the number as
            // written. NOTE: this changes the cell type of the in-memory workbook.
            cell.setCellType(Cell.CELL_TYPE_STRING);
            String raw = cell.getStringCellValue();
            if (raw.contains(".")) {
                return String.valueOf(Double.parseDouble(raw));
            }
            return raw.trim();
        }
        case Cell.CELL_TYPE_STRING:
            return String.valueOf(cell.getStringCellValue());
        case Cell.CELL_TYPE_BOOLEAN:
            return String.valueOf(cell.getBooleanCellValue());
        case Cell.CELL_TYPE_FORMULA:
            // The formula text itself, not its evaluated result
            return String.valueOf(cell.getCellFormula());
        case Cell.CELL_TYPE_BLANK:
        case Cell.CELL_TYPE_ERROR:
        default:
            return "";
    }
}

/**
 * Tells whether a data-format index is one of those this class renders as a date.
 * NOTE(review): apart from 14 the indexes look like locale-specific or custom
 * date formats; confirm against the workbooks actually processed.
 */
private static boolean isDateFormat(short format) {
    return format == 14 || format == 31 || format == 57 || format == 58
            || (176 <= format && format <= 178)
            || (182 <= format && format <= 196)
            || (210 <= format && format <= 213)
            || format == 208;
}
/**
 * Reads the text of a .ppt file.
 * <p>
 * For every slide the text of each text run is appended, followed by the
 * slide title when the slide has one. No separator is inserted between the
 * pieces.
 *
 * @param filePath full path of the file
 * @return the collected text, or {@code null} if opening or parsing fails
 *         (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractPpt(String filePath) {
    try (InputStream in = new FileInputStream(filePath)) {
        StringBuilder content = new StringBuilder();
        SlideShow ss = new SlideShow(new HSLFSlideShow(in));
        for (Slide slide : ss.getSlides()) {
            for (TextRun run : slide.getTextRuns()) {
                content.append(run.getText());
            }
            // A slide without a title yields null; appending it unguarded
            // wrote the literal text "null" into the result.
            String title = slide.getTitle();
            if (title != null) {
                content.append(title);
            }
        }
        return content.toString();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Reads the text of a .pptx file with {@code XSLFPowerPointExtractor}.
 *
 * @param filePath full path of the file
 * @return the extracted text, or {@code null} if opening or parsing fails
 *         (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractPptx(String filePath) {
    try (InputStream pptxStream = new FileInputStream(filePath)) {
        XMLSlideShow slideShow = new XMLSlideShow(pptxStream);
        XSLFPowerPointExtractor textExtractor = new XSLFPowerPointExtractor(slideShow);
        String text = textExtractor.getText();
        return text;
    } catch (Exception failure) {
        failure.printStackTrace();
        return null;
    }
}
/**
 * Reads the text of a PDF file with PDFBox ({@code PDFParser} +
 * {@code PDFTextStripper}).
 *
 * @param filePath full path of the file
 * @return the extracted text, or {@code null} if opening or parsing fails
 *         (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractPdf(String filePath) {
    RandomAccess rf = null;
    PDDocument document = null;
    try {
        rf = new RandomAccessFile(new File(filePath), "r");
        PDFParser parser = new PDFParser(rf);
        parser.parse();
        document = parser.getPDDocument();
        // Closing is left to the finally block only: an explicit close here
        // duplicated it and, if it threw, discarded the text already extracted.
        return new PDFTextStripper().getText(document);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close the document before the file it was parsed from
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (rf != null) {
            try {
                rf.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return null;
}
/**
 * Parses an XML file with dom4j and returns the document serialized back to
 * XML via {@code asXML()} (markup included, not only the text nodes).
 * <p>
 * External entities and external DTDs are switched off before parsing so that
 * a crafted file cannot make the parser read local files or fetch URLs (XXE).
 * Documents with an internal DOCTYPE are still accepted.
 *
 * @param filePath full path of the file
 * @return the serialized document, or {@code null} if opening or parsing
 *         fails (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractXml(String filePath) {
    try (InputStream in = new FileInputStream(filePath)) {
        SAXReader reader = new SAXReader();
        reader.setFeature("http://xml.org/sax/features/external-general-entities", false);
        reader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        Document doc = reader.read(in);
        return doc.asXML();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
}
2.3、结果
二、获取html中文本
1、pom
<dependency>
<groupId>htmlparser</groupId>
<artifactId>htmlparser</artifactId>
<version>1.0.0</version>
</dependency>
2、代码
/**
 * Demo: extracts the visible text of an HTML file with htmlparser.
 */
public class Test {
    public static void main(String[] args) {
        System.out.println(toText(new File("C:\\test\\20230228\\0bd25c923b9f4dd0b08147f0d09b650a.html")));
    }

    /**
     * Returns the page title followed by the plain text of the body nodes,
     * one per line, with {@code script} nodes filtered out.
     *
     * @param file the HTML file; it is read as UTF-8
     * @return the extracted text (possibly partial if parsing fails midway),
     *         or {@code null} when the file is empty or cannot be read
     */
    public static String toText(File file) {
        if (file.length() <= 0) {
            return null;
        }
        String content = readText(file);
        if (content == null) {
            // readText has already reported the failure; do not hand null to the parser
            return null;
        }
        StringBuilder strb = new StringBuilder();
        try {
            Parser myParser = Parser.createParser(content, "utf-8");
            // Keep every node that is NOT a <script> tag
            NotFilter notFilter = new NotFilter();
            notFilter.setPredicate(new TagNameFilter("script"));

            HtmlPage visitor = new HtmlPage(myParser);
            myParser.visitAllNodesWith(visitor);

            strb.append(visitor.getTitle() + "\n");

            NodeList textBody1 = visitor.getBody();
            textBody1.keepAllNodesThatMatch(notFilter, true);
            for (Node n1 : textBody1.toNodeArray()) {
                strb.append(n1.toPlainTextString().trim() + "\n");
            }
        } catch (ParserException e) {
            // Previously swallowed silently; report it and return what was collected so far
            e.printStackTrace();
        }
        return strb.toString();
    }

    /**
     * Reads a whole file and decodes it as UTF-8.
     *
     * @param file the file to read
     * @return the file content, or {@code null} if reading fails (the stack
     *         trace is printed)
     */
    public static String readText(File file) {
        // try-with-resources closes both streams on every path
        try (InputStream input = new FileInputStream(file);
             ByteArrayOutputStream output = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[4096];
            int n;
            while (-1 != (n = input.read(buffer))) {
                output.write(buffer, 0, n);
            }
            return output.toString("UTF-8");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
3、结果
进入词条
清除历史记录关闭
播报
编辑
收藏
赞
登录
扫码下载百科APP
领取50财富值奖励
分享到微信朋友圈
打开微信“扫一扫”即可将网页分享至朋友圈
选择朗读音色
00:00
00:00
三、问题解决
1、java.io.IOException: Zip bomb detected!
(1)背景
通过poi获取docx文件中文本的时候出现该问题。
报错代码如下:
(2)错误
java.io.IOException: Zip bomb detected! The file would exceed the max. ratio of compressed file size to the size of the expanded data.
This may indicate that the file is used to inflate memory usage and thus could pose a security risk.
You can adjust this limit via ZipSecureFile.setMinInflateRatio() if you need to work with files which exceed this limit.
Uncompressed size: 102821, Raw/compressed size: 984, ratio: 0.009570
Limits: MIN_INFLATE_RATIO: 0.010000, Entry: word/fonts/font2.odttf
at org.apache.poi.openxml4j.util.ZipArchiveThresholdInputStream.checkThreshold(ZipArchiveThresholdInputStream.java:132)
at org.apache.poi.openxml4j.util.ZipArchiveThresholdInputStream.read(ZipArchiveThresholdInputStream.java:82)
at org.apache.poi.util.IOUtils.toByteArray(IOUtils.java:182)
at org.apache.poi.util.IOUtils.toByteArray(IOUtils.java:149)
at org.apache.poi.openxml4j.util.ZipArchiveFakeEntry.<init>(ZipArchiveFakeEntry.java:47)
at org.apache.poi.openxml4j.util.ZipInputStreamZipEntrySource.<init>(ZipInputStreamZipEntrySource.java:53)
at org.apache.poi.openxml4j.opc.ZipPackage.<init>(ZipPackage.java:106)
at org.apache.poi.openxml4j.opc.OPCPackage.open(OPCPackage.java:307)
at org.apache.poi.ooxml.util.PackageHelper.open(PackageHelper.java:47)
at org.apache.poi.xwpf.usermodel.XWPFDocument.<init>(XWPFDocument.java:142)
at com.atguigu.multistagebuildsdemo.Test1.extractDocx(Test1.java:85)
at com.atguigu.multistagebuildsdemo.Test1.main(Test1.java:32)
(3)解决办法
提升poi相关依赖版本为4.1.2,但是会造成extractXls、extractXlsx、extractPpt、extractXml方法缺少相关类,所以基本只剩下抽取doc、docx的方法可用了,pom.xml修改结果如下:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.1</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.9</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.3</version>
</dependency>
添加一行代码就行了,如下:
// Lower the minimum inflate ratio checked by POI's zip-bomb guard. The error
// above reports a limit of 0.010000 and an entry ratio of 0.009570, so 0.001
// lets that entry through.
// NOTE(review): this is a static setter, so it presumably applies to every
// OOXML file opened afterwards in the same JVM - confirm this is acceptable.
ZipSecureFile.setMinInflateRatio(0.001);
在上面代码中抽取doc、docx的方法中添加上面这行代码即可,添加结果如下:
/**
 * Reads the text of a .doc file with {@code WordExtractor} (POI 4.1.2
 * variant with the relaxed zip-bomb ratio).
 *
 * @param filePath full path of the file
 * @return the extracted text, or {@code null} if opening or parsing fails
 *         (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractDoc(String filePath) {
    try (InputStream in = new FileInputStream(filePath)) {
        // NOTE(review): static setter, presumably JVM-wide; it relaxes the
        // zip-bomb check from the reported default of 0.01 to 0.001.
        ZipSecureFile.setMinInflateRatio(0.001);
        // The extractor is Closeable in POI 4.x; close it together with the stream
        try (WordExtractor extractor = new WordExtractor(in)) {
            return extractor.getText();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Reads the text of a .docx file with {@code XWPFWordExtractor} (POI 4.1.2
 * variant with the relaxed zip-bomb ratio).
 *
 * @param filePath full path of the file
 * @return the extracted text, or {@code null} if opening or parsing fails
 *         (the stack trace is printed)
 * @author 明快de玄米61
 * @date 2022/12/18 23:22
 **/
private static String extractDocx(String filePath) {
    try (InputStream in = new FileInputStream(filePath)) {
        // Must run before the document is opened. NOTE(review): static setter,
        // presumably JVM-wide; it relaxes the zip-bomb check from the reported
        // default of 0.01 to 0.001.
        ZipSecureFile.setMinInflateRatio(0.001);
        // The extractor is Closeable in POI 4.x; closing it releases the
        // document it wraps.
        try (XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in))) {
            return extractor.getText();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}