// Word 纯文本导入,使用 Apache POI 实现。
// 其中的实体类(Question)可以不用看,只需关注两种 Word 格式(doc、docx)的数据封装即可。
/**
 * Parses a Word document (.doc or .docx) into a list of questions.
 *
 * <p>The non-blank, trimmed paragraphs of the document are scanned in order:
 * <ul>
 *   <li>a section header line (see {@code sectionTypeOf}) switches the current question type and
 *       is stored as the stem of the pending question;</li>
 *   <li>a numbered line (see {@code isTitele}) becomes the stem of the pending question;</li>
 *   <li>once a section header has been seen, every other line is added as an option of the
 *       pending question, and a line starting with "答案:" additionally completes the question
 *       and starts a new one.</li>
 * </ul>
 * Each processed line is also echoed to standard output.
 *
 * @param in path of the Word file; the type is decided from its extension by {@code iswordtype}
 * @return the parsed questions (possibly empty), or {@code null} when the extension is not
 *         recognised or the document contains no non-blank paragraph
 * @throws Exception if the file cannot be opened or read by POI
 */
public static List<Question> parseWord(String in) throws Exception{
    // Resolve the document type once instead of re-evaluating it per branch.
    int wordType = iswordtype(in);
    List<String> list;
    if (wordType == 1) {
        list = readDocParagraphs(in);
    } else if (wordType == 2) {
        list = readDocxParagraphs(in);
    } else {
        System.out.println("格式不对");
        return null;
    }
    if (list.isEmpty()) {
        // Kept for backward compatibility: callers receive null for an empty document.
        return null;
    }

    List<Question> newlist = new ArrayList<>();
    Question question = new Question();
    List<Question.QuestionOption> qolist = new ArrayList<>();
    int type = 0; // 0 means no section header has been seen yet
    for (String line : list) {
        int sectionType = sectionTypeOf(line);
        if (sectionType != 0) {
            type = sectionType;
            question.setQuestiontype(type);
            question.setStem(line);
            System.out.println("\n" + line);
            continue;
        }
        if (isTitele(StringUtils.trim(line))) {
            question.setStem(line);
            System.out.println("类型: " + type);
            System.out.println(line);
            continue;
        }
        System.out.println(line);
        if (type == 0) {
            // Lines before the first section header are only echoed, never stored.
            continue;
        }
        Question.QuestionOption option = new Question.QuestionOption();
        option.setOptionCont(line);
        qolist.add(option);
        if (StringUtils.trim(line).startsWith("答案:")) {
            // The answer line closes the current question.
            question.setQuestionOptionList(qolist);
            newlist.add(question);
            qolist = new ArrayList<>();
            question = new Question();
            // Carry the section type over; previously only the first question after a
            // section header had its type set.
            question.setQuestiontype(type);
        }
    }
    return newlist;
}

/**
 * Reads the non-blank paragraphs of a binary .doc file, trimmed.
 * The input stream is closed even when extraction fails.
 */
private static List<String> readDocParagraphs(String path) throws Exception {
    List<String> lines = new ArrayList<>();
    InputStream is = new FileInputStream(path);
    try {
        WordExtractor extractor = new WordExtractor(is);
        for (String paragraph : extractor.getParagraphText()) {
            String text = StringUtils.trim(paragraph);
            if (StringUtils.isNotBlank(text)) {
                lines.add(text);
            }
        }
    } finally {
        close(is);
    }
    return lines;
}

/**
 * Reads the non-blank paragraphs of a .docx file, trimmed.
 * The package is always released afterwards.
 */
private static List<String> readDocxParagraphs(String path) throws Exception {
    List<String> lines = new ArrayList<>();
    OPCPackage oPCPackage = POIXMLDocument.openPackage(path);
    try {
        XWPFDocument doc = new XWPFDocument(oPCPackage);
        for (XWPFParagraph para : doc.getParagraphs()) {
            String text = StringUtils.trim(para.getText());
            if (StringUtils.isNotBlank(text)) {
                lines.add(text);
            }
        }
    } finally {
        // revert() releases the package without writing anything back to the file;
        // this method only reads, so nothing must be saved.
        oPCPackage.revert();
    }
    return lines;
}

/**
 * Maps a section header line to its question type code, or 0 when the line is not a header.
 * Codes: 3 = "、判断"/"、是非"/"、对错", 1 = "、单选"/"、单项", 2 = "、多选",
 * 5 = "、案例"/"、病案", 4 = "、主观". The checks run in that order.
 */
private static int sectionTypeOf(String line) {
    if (line.contains("、判断") || line.contains("、是非") || line.contains("、对错")) {
        return 3;
    }
    if (line.contains("、单选") || line.contains("、单项")) {
        return 1;
    }
    if (line.contains("、多选")) {
        return 2;
    }
    // Uses contains like the other headers; the former startsWith check could not match
    // headers carrying a leading ordinal such as "五、案例".
    if (line.contains("、案例") || line.contains("、病案")) {
        return 5;
    }
    if (line.contains("、主观")) {
        return 4;
    }
    return 0;
}
/**
 * Determines the Word document type from the file name extension.
 * The comparison ignores case, so ".DOC" and ".Docx" are recognised as well.
 *
 * @param path file name or path; may be {@code null}
 * @return 1 for a ".doc" extension, 2 for ".docx", 3 for anything else (including {@code null})
 */
public static int iswordtype(String path){
    if (path == null) {
        return 3;
    }
    int length = path.length();
    // regionMatches returns false (rather than throwing) when the path is shorter than the suffix.
    if (path.regionMatches(true, length - 4, ".doc", 0, 4)) {
        return 1;
    }
    if (path.regionMatches(true, length - 5, ".docx", 0, 5)) {
        return 2;
    }
    return 3;
}
/**
 * Closes the given input stream, if any.
 * An {@link IOException} raised while closing is not propagated; its stack trace is printed.
 *
 * @param is the stream to close; {@code null} is ignored
 */
public static void close(InputStream is) {
    if (is == null) {
        return;
    }
    try {
        is.close();
    } catch (IOException closeFailure) {
        closeFailure.printStackTrace();
    }
}
/**
 * Prints the category, creator, creation time and title of the given core properties to
 * standard output, one per line, framed by a start and an end marker line.
 *
 * @param coreProps the core properties to print
 */
private static void printCoreProperties(POIXMLProperties.CoreProperties coreProps) {
    System.out.println("开始信息----------------");

    final Object category = coreProps.getCategory();
    System.out.println(category);

    final Object creator = coreProps.getCreator();
    System.out.println(creator);

    final Object created = coreProps.getCreated();
    System.out.println(created);

    final Object title = coreProps.getTitle();
    System.out.println(title);

    System.out.println("结束信息----------------");
}
/**
 * Prints the author, character count, page count, title and subject of the given
 * summary information to standard output, one per line and in that order.
 *
 * @param info the summary information to print
 */
private static void printInfo(SummaryInformation info) {
    final Object author = info.getAuthor();
    System.out.println(author);

    final Object charCount = info.getCharCount();
    System.out.println(charCount);

    final Object pageCount = info.getPageCount();
    System.out.println(pageCount);

    final Object title = info.getTitle();
    System.out.println(title);

    final Object subject = info.getSubject();
    System.out.println(subject);
}
/**
 * Prints the category and the company of the given document summary information to
 * standard output, one per line and in that order.
 *
 * @param info the document summary information to print
 */
private static void printInfo(DocumentSummaryInformation info) {
    final Object category = info.getCategory();
    System.out.println(category);

    final Object company = info.getCompany();
    System.out.println(company);
}
// One or more digits followed by one of '-', '、', '|' or '.', then any remaining characters.
// Compiled once because the check runs for every line of a parsed document.
private static final Pattern TITLE_NUMBER_PATTERN = Pattern.compile("^([\\d]+[-\\、|\\.].*)");

/**
 * Checks whether the string starts with a numbered sub-title prefix such as "1、", "2." or "3-".
 *
 * @param str the text to test; may be {@code null}
 * @return {@code true} if the whole string matches the numbered-title pattern,
 *         {@code false} otherwise (including {@code null})
 */
public static boolean isTitele(String str){
    if (str == null) {
        return false;
    }
    return TITLE_NUMBER_PATTERN.matcher(str).matches();
}
/**
 * Checks whether the string consists solely of the ASCII digits 0-9.
 * The empty string is considered numeric because it contains no other character.
 *
 * @param str the text to test
 * @return {@code true} if every character is between '0' and '9'
 */
public static boolean isNumeric(String str){
    for (int index = 0; index < str.length(); index++) {
        char current = str.charAt(index);
        if (current < '0' || current > '9') {
            return false;
        }
    }
    return true;
}
// One or more ASCII letters followed by '-' or ':', then any remaining characters.
// Compiled once instead of on every call.
private static final Pattern OPTION_LABEL_PATTERN = Pattern.compile("^([a-zA-Z]+[-\\:].*)");

/**
 * Checks whether the string looks like a choice-question option, i.e. starts with a letter
 * label followed by '-' or ':' (for example "A:..." or "B-...").
 *
 * @param str the text to test; may be {@code null}
 * @return {@code true} if the whole string matches the option pattern,
 *         {@code false} otherwise (including {@code null})
 */
public static boolean isSelecteTitele(String str){
    if (str == null) {
        return false;
    }
    return OPTION_LABEL_PATTERN.matcher(str).matches();
}
/**
 * Checks whether the string contains a top-level heading marker, i.e. one of the Chinese
 * ordinals for one through eight immediately followed by "、".
 *
 * @param str the text to test
 * @return {@code true} if any of the eight markers occurs anywhere in the string
 */
public static boolean isBigTilete(String str){
    final String[] headingMarkers = {"一、", "二、", "三、", "四、", "五、", "六、", "七、", "八、"};
    for (String marker : headingMarkers) {
        if (str.contains(marker)) {
            return true;
        }
    }
    return false;
}