Java 读取 PDF 文件内容并替换指定文字
需要使用到的包
监听类(对需要替换的内容关键词进行匹配)
实体类(保存关键字字体格式信息以及其位置)
工具类(对关键字进行替换)
测试类
需要使用到的包
<!-- iText 5 core library: PDF reading, content parsing and stamping. -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.13.3</version>
</dependency>
<!-- iText Asian font pack: supplies the CJK font/CMap resources requested by
     BaseFont.createFont("STSong-Light", "UniGB-UCS2-H", ...) in PdfUtil. -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itext-asian</artifactId>
<version>5.2.0</version>
</dependency>
监听类(对需要替换的内容关键词进行匹配)
/**
 * Render listener that records every text chunk reported while a PDF page is
 * parsed and, separately, the chunks whose whole text equals the configured
 * keyword (compared case-insensitively).
 */
public class KeyWordPositionListener implements RenderListener {

    /** Chunks whose complete text equals the keyword, ignoring case. */
    private List<MatchItem> matches = new ArrayList<MatchItem>();

    /** Every chunk rendered so far, in the order it was reported. */
    private List<MatchItem> allItems = new ArrayList<MatchItem>();

    /** Size of the page being parsed; copied into each recorded chunk. */
    private Rectangle curPageSize;

    /** Keyword to compare each chunk against. */
    private String keyword;

    /** Number of the page being parsed; copied into each recorded chunk. */
    private Integer pageNumber;

    @Override
    public void beginTextBlock() {
        // nothing to do
    }

    /**
     * Records the rendered chunk with its position and size, and also adds it
     * to the match list when its text equals the keyword (ignoring case).
     *
     * @param renderInfo the text chunk reported by the content parser
     */
    @Override
    public void renderText(TextRenderInfo renderInfo) {
        final String text = renderInfo.getText();
        final Rectangle2D.Float bounds = renderInfo.getDescentLine().getBoundingRectange();

        // A zero-height bounding box falls back to a height of 12.
        float height = bounds.height;
        if (height == 0) {
            height = 12;
        }

        final MatchItem chunk = new MatchItem();
        chunk.setContent(text);
        chunk.setPageNum(pageNumber);
        chunk.setFontHeight(height);
        chunk.setFontWidth(bounds.width);
        chunk.setPageHeight(curPageSize.getHeight());
        chunk.setPageWidth(curPageSize.getWidth());
        chunk.setX((float) bounds.getX());
        chunk.setY((float) bounds.getY());

        // The whole chunk is the keyword (e.g. a single-character keyword).
        final boolean wholeChunkMatches = text.equalsIgnoreCase(keyword);
        if (wholeChunkMatches) {
            matches.add(chunk);
        }
        // Every chunk is kept, matched or not.
        allItems.add(chunk);
    }

    @Override
    public void endTextBlock() {
        // nothing to do
    }

    @Override
    public void renderImage(ImageRenderInfo renderInfo) {
        // nothing to do
    }

    /**
     * Sets the number of the page that is about to be parsed.
     *
     * @param pageNumber page number stored in every recorded chunk
     */
    public void setPageNumber(Integer pageNumber) {
        this.pageNumber = pageNumber;
    }

    /**
     * Sets the keyword to look for; the comparison ignores case.
     *
     * @param keyword keyword to match
     */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    /**
     * @return the chunks whose complete text matched the keyword
     */
    public List<MatchItem> getMatches() {
        return matches;
    }

    /**
     * Sets the size of the page that is about to be parsed.
     *
     * @param rect page size stored in every recorded chunk
     */
    public void setCurPageSize(Rectangle rect) {
        this.curPageSize = rect;
    }

    /**
     * @return every chunk recorded so far
     */
    public List<MatchItem> getAllItems() {
        return allItems;
    }

    /**
     * Replaces the list of recorded chunks.
     *
     * @param allItems the new chunk list
     */
    public void setAllItems(List<MatchItem> allItems) {
        this.allItems = allItems;
    }
}
实体类(保存关键字字体格式信息以及其位置)
/**
 * Describes one text chunk found in a PDF: its text, the page it is on, the
 * page dimensions, and the chunk's position and size.
 */
public class MatchItem {

    /** Page number the chunk was found on. */
    private Integer pageNum;

    /** X coordinate of the chunk. */
    private Float x;

    /** Y coordinate of the chunk. */
    private Float y;

    /** Width of the page. */
    private Float pageWidth;

    /** Height of the page. */
    private Float pageHeight;

    /** Text of the chunk. */
    private String content;

    /** Width of the chunk's text. */
    private float fontWidth;

    /** Height of the chunk's text; defaults to 12. */
    private float fontHeight = 12;

    public Integer getPageNum() {
        return this.pageNum;
    }

    public void setPageNum(Integer pageNum) {
        this.pageNum = pageNum;
    }

    public Float getX() {
        return this.x;
    }

    public void setX(Float x) {
        this.x = x;
    }

    public Float getY() {
        return this.y;
    }

    public void setY(Float y) {
        this.y = y;
    }

    public Float getPageWidth() {
        return this.pageWidth;
    }

    public void setPageWidth(Float pageWidth) {
        this.pageWidth = pageWidth;
    }

    public Float getPageHeight() {
        return this.pageHeight;
    }

    public void setPageHeight(Float pageHeight) {
        this.pageHeight = pageHeight;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public float getFontWidth() {
        return this.fontWidth;
    }

    public void setFontWidth(float fontWidth) {
        this.fontWidth = fontWidth;
    }

    public float getFontHeight() {
        return this.fontHeight;
    }

    public void setFontHeight(float fontHeight) {
        this.fontHeight = fontHeight;
    }

    /**
     * Renders the page number, position, page size and text of this chunk;
     * the font width and height are not part of the output.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("MatchItem{");
        text.append("pageNum=").append(pageNum);
        text.append(", x=").append(x);
        text.append(", y=").append(y);
        text.append(", pageWidth=").append(pageWidth);
        text.append(", pageHeight=").append(pageHeight);
        text.append(", content='").append(content).append('\'');
        text.append('}');
        return text.toString();
    }
}
工具类(对关键字进行替换)
/**
 * Replaces a keyword in a PDF by locating the text chunks that spell it,
 * painting a white rectangle over each occurrence and writing the replacement
 * text on top.
 */
public class PdfUtil {

    /**
     * Searches every page of the PDF for the keyword.
     *
     * @param bytes   the PDF document
     * @param keyword the keyword to search for
     * @return the first chunk of every occurrence, in page order
     * @throws Exception if the PDF cannot be read or parsed
     */
    private static List<MatchItem> matchAll(byte[] bytes, String keyword) throws Exception {
        List<MatchItem> items = new ArrayList<>();
        PdfReader reader = new PdfReader(bytes);
        try {
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                items.addAll(matchPage(reader, page, keyword));
            }
        } finally {
            // Release the reader even when parsing a page fails.
            reader.close();
        }
        return items;
    }

    /**
     * Searches a single page of the PDF for the keyword.
     *
     * @param reader     reader opened on the PDF
     * @param pageNumber number of the page to parse
     * @param keyword    the keyword to search for
     * @return the first chunk of every occurrence on that page
     * @throws Exception if the page content cannot be parsed
     */
    private static List<MatchItem> matchPage(PdfReader reader, Integer pageNumber, String keyword) throws Exception {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        Rectangle pageSize = reader.getPageSize(pageNumber);
        KeyWordPositionListener renderListener = new KeyWordPositionListener();
        renderListener.setKeyword(keyword);
        renderListener.setPageNumber(pageNumber);
        renderListener.setCurPageSize(pageSize);
        parser.processContent(pageNumber, renderListener);
        return findKeywordItems(renderListener, keyword);
    }

    /**
     * Finds every run of consecutive chunks whose concatenated text equals the
     * keyword, ignoring case, and returns the first chunk of each run.
     *
     * <p>A run may be a single chunk or several chunks. Each non-empty chunk is
     * tried as a starting point, so an occurrence is found even when it
     * follows a partial prefix of the keyword or shares a page with an
     * occurrence that sits in a single chunk. Occurrences do not overlap.
     *
     * @param renderListener listener holding the chunks of one parsed page
     * @param keyword        the keyword to search for
     * @return the first chunk of every occurrence; empty when the keyword is
     *         null or empty
     */
    private static List<MatchItem> findKeywordItems(KeyWordPositionListener renderListener, String keyword) {
        List<MatchItem> found = new ArrayList<>();
        if (keyword == null || keyword.isEmpty()) {
            return found;
        }
        List<MatchItem> chunks = renderListener.getAllItems();
        int start = 0;
        while (start < chunks.size()) {
            String first = chunks.get(start).getContent();
            // An empty chunk has no width to anchor the replacement on.
            if (first == null || first.isEmpty()) {
                start++;
                continue;
            }
            // Append chunks until the candidate is at least as long as the keyword.
            StringBuilder candidate = new StringBuilder();
            int end = start;
            while (end < chunks.size() && candidate.length() < keyword.length()) {
                String part = chunks.get(end).getContent();
                if (part != null) {
                    candidate.append(part);
                }
                end++;
            }
            if (candidate.toString().equalsIgnoreCase(keyword)) {
                found.add(chunks.get(start));
                // Continue after the matched run so occurrences do not overlap.
                start = end;
            } else {
                start++;
            }
        }
        return found;
    }

    /**
     * Estimates the width of one character of the chunk. The recorded width
     * covers the whole chunk, so it is divided by the number of characters;
     * when that yields no positive width the recorded font height is used.
     */
    private static float charWidthOf(MatchItem item) {
        String content = item.getContent();
        int charCount = (content == null || content.isEmpty()) ? 1 : content.length();
        float width = item.getFontWidth() / charCount;
        return width > 0 ? width : item.getFontHeight();
    }

    /**
     * Covers every matched occurrence with a white rectangle, writes the
     * replacement text over it and writes the resulting PDF to the stream.
     *
     * @param bytes        the source PDF
     * @param outputStream receives the modified PDF
     * @param matchItems   first chunk of every occurrence to replace
     * @param keyWord      the text being replaced; its length sizes the rectangle
     * @param keyWordNew   the replacement text
     * @throws Exception if the PDF cannot be read or written, or the font cannot be created
     */
    private static void manipulatePdf(byte[] bytes, OutputStream outputStream, List<MatchItem> matchItems,
                                      String keyWord, String keyWordNew) throws Exception {
        PdfReader reader = new PdfReader(bytes);
        try {
            PdfStamper stamper = new PdfStamper(reader, outputStream);
            // One font serves all replacements; it is only created when there is something to write.
            BaseFont baseFont = matchItems.isEmpty()
                    ? null
                    : BaseFont.createFont("STSong-Light", "UniGB-UCS2-H", BaseFont.EMBEDDED);
            for (MatchItem item : matchItems) {
                PdfContentByte canvas = stamper.getOverContent(item.getPageNum());
                float x = item.getX();
                float y = item.getY();
                // The per-character width doubles as the font size estimate.
                float charWidth = charWidthOf(item);

                // Hide the original text behind a white rectangle.
                canvas.saveState();
                canvas.setColorFill(BaseColor.WHITE);
                canvas.rectangle(x, y, charWidth * keyWord.length(), charWidth + 2);
                canvas.fill();
                canvas.restoreState();

                // Write the replacement text slightly above the rectangle's bottom edge.
                canvas.beginText();
                canvas.setFontAndSize(baseFont, charWidth);
                canvas.setTextMatrix(x, y + charWidth / 10 + 0.5f);
                canvas.showText(keyWordNew);
                canvas.endText();
            }
            // Closed on the success path only, so a failure is not masked by a second exception.
            stamper.close();
        } finally {
            reader.close();
        }
    }

    /**
     * Replaces every occurrence of a keyword in a PDF.
     *
     * @param srcBytes     the source PDF
     * @param outputStream receives the modified PDF
     * @param keyWord      the text to replace; must not be null or empty
     * @param keyWordNew   the replacement text; must not be null
     * @throws IllegalArgumentException if {@code keyWord} is null or empty, or {@code keyWordNew} is null
     * @throws Exception if the PDF cannot be read, parsed or written
     */
    public static void pdfReplace(byte[] srcBytes, OutputStream outputStream, String keyWord, String keyWordNew) throws Exception {
        if (keyWord == null || keyWord.isEmpty()) {
            throw new IllegalArgumentException("keyWord must not be null or empty");
        }
        if (keyWordNew == null) {
            throw new IllegalArgumentException("keyWordNew must not be null");
        }
        manipulatePdf(srcBytes, outputStream, matchAll(srcBytes, keyWord), keyWord, keyWordNew);
    }
}
测试类
/**
 * Demo entry point: replaces a keyword in D:\test.pdf and writes the result
 * to D:\dest.pdf.
 */
public class test {

    /**
     * Reads the source PDF, replaces the keyword and writes the destination
     * file. The destination is only opened after the replacement succeeded, so
     * a failure does not leave a truncated file behind.
     *
     * @param args unused
     * @throws RuntimeException wrapping any failure while reading, replacing or writing
     */
    public static void main(String[] args) {
        // Source PDF
        File file = new File("D:\\test.pdf");
        // Destination PDF
        File destFile = new File("D:\\dest.pdf");
        // Text to replace
        String keyWord = "请选择";
        // Replacement text
        String keyWordNew = "你爸爸";
        try {
            byte[] bytes = readFully(file);
            ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
            PdfUtil.pdfReplace(bytes, outputStream, keyWord, keyWordNew);
            try (FileOutputStream fileOutputStream = new FileOutputStream(destFile)) {
                fileOutputStream.write(outputStream.toByteArray());
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Reads the whole file into memory. A single read() call is not guaranteed
     * to fill the buffer, so the stream is drained until end of file.
     */
    private static byte[] readFully(File file) throws IOException {
        try (FileInputStream inputStream = new FileInputStream(file)) {
            ByteArrayOutputStream content = new ByteArrayOutputStream();
            byte[] buffer = new byte[8192];
            int read;
            while ((read = inputStream.read(buffer)) != -1) {
                content.write(buffer, 0, read);
            }
            return content.toByteArray();
        }
    }
}