前言:项目中,遇到需要将 word 文档转为 html 文档的需求,下面整理一下用到的两种方式:


/**
 * Converts a Word document to HTML text.
 *
 * <p>The document is saved to an in-memory stream with the headers/footers
 * export mode set to {@code ExportHeadersFootersMode.NONE}, and the bytes are
 * then decoded as UTF-8.
 *
 * @param doc the document object to convert
 * @return the generated HTML text, or an empty string if the conversion fails
 */
public String parseWord2Html(Document doc){
    HtmlSaveOptions saveOptions = new HtmlSaveOptions();
    // See the HtmlSaveOptions API for the other available settings.
    saveOptions.setExportHeadersFootersMode(ExportHeadersFootersMode.NONE);
    // In-memory stream: ByteArrayOutputStream.close() is a no-op, so no cleanup is required.
    ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
    String htmlText = "";
    try {
        doc.save(htmlStream, saveOptions);
        // NOTE(review): assumes the save options emit UTF-8 by default - confirm,
        // or set the encoding explicitly on saveOptions.
        htmlText = new String(htmlStream.toByteArray(),"UTF-8");
    } catch (Exception e) {
        // Pass the exception as the trailing argument so the logger records the
        // full stack trace and cause, not just the message.
        // NOTE(review): assumes LOG is an SLF4J-style logger (the "{}" placeholder suggests so).
        LOG.error("word文件转换失败,详细错误信息:{}",e.getMessage(), e);
    }
    return htmlText;
}
  • 方案二
    使用 apache 的 poi 进行转换。
package test.poi;

import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

public class test {

public static synchronized String getOnlyId(){
long curL = System.currentTimeMillis();
if(curL>curL){
cacheInt = 0;
}else{
cacheInt += 1;
}
return String.valueOf(curL)+String.valueOf(cacheInt);
}

private static int cacheInt = 0;

private static String projectUrl = "http://192.168.1.1:8080/test/";

private static String projectPath = "D:/test/piccache/";

/**
* 处理word2003
* @param inFile
* @return
*/
public static String doWord(File inFile) {
ByteArrayOutputStream out = new ByteArrayOutputStream();
String randomName = "PIC"+getOnlyId();
//转换后html中图片src的链接
final String baseUrl = projectUrl+"wordpic/"+randomName+"/";
//转换后图片存放的位置
String dir = projectPath+"/wordpic/"+randomName+"/";
File dirF = new File(dir);
if(!dirF.exists()||!dirF.isDirectory()){
dirF.mkdir();
}
try{
HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(inFile));
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
wordToHtmlConverter.setPicturesManager( new PicturesManager()
{
public String savePicture( byte[] content,
PictureType pictureType, String suggestedName,
float widthInches, float heightInches )
{
return baseUrl+suggestedName;
}
} );
wordToHtmlConverter.processDocument(wordDocument);
List<Picture> pics=wordDocument.getPicturesTable().getAllPictures();
if(pics!=null){
for(int i=0;i<pics.size();i++){
Picture pic = (Picture)pics.get(i);
try {
pic.writeImageContent(new FileOutputStream(dir + pic.suggestFullFileName()));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
}
Document htmlDocument = wordToHtmlConverter.getDocument();

DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(out);

TransformerFactory tf = TransformerFactory.newInstance();
Transformer serializer = tf.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "GB2312");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
out.close();
}catch(Exception e){
e.printStackTrace();
}
return new String(out.toByteArray());
}

/**
* 处理word2007
* @param inFile
* @return
*/
public static String doWord2007(File inFile) {
ByteArrayOutputStream out = new ByteArrayOutputStream();
String randomName = "PIC"+getOnlyId();
//转换后html中图片src的链接
final String baseUrl = projectUrl+"wordpic/"+randomName+"/";
//转换后图片存放的位置
String dir = projectPath+"/wordpic/"+randomName+"/";
File dirF = new File(dir);
if(!dirF.exists()||!dirF.isDirectory()){
dirF.mkdir();
}
try{
XWPFDocument wordDocument = new XWPFDocument(new FileInputStream(inFile));
XHTMLOptions options = XHTMLOptions.create().URIResolver(new BasicURIResolver(baseUrl));
File imageFolderFile = new File(dir);
options.setExtractor(new FileImageExtractor(imageFolderFile));
XHTMLConverter.getInstance().convert(wordDocument, out, options);
out.close();
}catch(Exception e){
e.printStackTrace();
}
return new String(out.toByteArray());
}


public static void main(String[] args) {
File word2003 = new File("e:/文书系统2.0版本设计.doc");
File word2007 = new File("e:/test.docx");

/** 打印出word2003转换后的html内容*/
System.out.println(doWord(word2003));

/** 打印出word2007转换后的html内容*/
System.out.println(doWord2007(word2007));
}

}

用到的jar包:

word文档转为html文档解决方案_转html

完整源码及 jar 包: