读取word中的内容


/**
 * 利用POI实现从word中读取内容
 */
============================================
将读取的内容写入另外一个文件中
package com.cy;
/**
 * WordReader类中readDoc的作用为从word中将数据读出
 */import com.cy.WordWriter;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.IOException;import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * Writes a string into a new POIFS (OLE2) container file.
 */
public class WordWriter {

    /**
     * Stores the bytes of {@code content} in a "WordDocument" entry of a new
     * POIFS file system and writes that file system to {@code path}.
     *
     * @param path    destination file path
     * @param content text whose bytes become the "WordDocument" entry
     * @return {@code true} if the file was written; {@code false} if an
     *         argument is {@code null} or an {@link IOException} occurred
     */
    public static boolean writeDoc(String path, String content) {
        if (path == null || content == null) {
            return false;
        }
        try {
            // Encoded with the platform default charset, as before.
            // NOTE(review): an explicit charset may be preferable -- confirm what readers of the file expect.
            byte[] bytes = content.getBytes();
            POIFSFileSystem fs = new POIFSFileSystem();
            DirectoryEntry directory = fs.getRoot();
            directory.createDocument("WordDocument", new ByteArrayInputStream(bytes));

            FileOutputStream ostream = new FileOutputStream(path);
            try {
                fs.writeFilesystem(ostream);
            } finally {
                // Always release the file handle, even when writing fails.
                ostream.close();
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Reads the text of D://test.doc via {@code WordReader.readDoc} and writes
     * it to D://result.doc.
     *
     * @param args unused
     * @throws Exception if reading the input document fails
     */
    public static void main(String[] args) throws Exception {
        String wr = WordReader.readDoc("D://test.doc");
        writeDoc("D://result.doc", wr);
    }
}
 // NOTE: this program currently handles only plain text; tables and styling are not supported yet. Improvements are planned, so stay tuned.
import java.io.File;
import java.io.FileInputStream;import org.apache.poi.hwpf.extractor.WordExtractor;
public class WordReader {

    /**
     * Extracts the text of a DOC file using {@link WordExtractor}.
     *
     * @param doc path of the DOC file to read
     * @return the text returned by {@code WordExtractor.getText()}
     * @throws Exception if the file cannot be opened or the extractor fails
     */
    public static String readDoc(String doc) throws Exception {
        // Open an input stream on the DOC file
        FileInputStream in = new FileInputStream(new File(doc));
        try {
            // Build the extractor and pull the text out of the document
            WordExtractor extractor = new WordExtractor(in);
            return extractor.getText();
        } finally {
            // Release the file handle whether or not extraction succeeded.
            in.close();
        }
    }

    /**
     * Prints the text of d://test.doc to standard output; any failure is
     * reported with a stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String text = WordReader.readDoc("d://test.doc");
            System.out.println(text);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

摘自:

 

import java.io.File;
 import java.io.FileInputStream; 
import java.io.FileNotFoundException;
 
import org.apache.poi.hwpf.HWPFDocument; 
import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableIterator; 
import org.apache.poi.hwpf.usermodel.TableRow;
 
import java.io.File; 
import java.io.FileInputStream;   
import java.io.InputStream;    
 import org.apache.poi.POIXMLDocument;  
 import org.apache.poi.POIXMLTextExtractor;   
import org.apache.poi.hwpf.extractor.WordExtractor;   
import org.apache.poi.openxml4j.opc.OPCPackage;   
import org.apache.poi.xwpf.extractor.XWPFWordExtractor; 
 
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
public class ExportDocImpl {

    /**
     * Prints the text of every paragraph in every table cell of
     * D:\sinye.doc to standard output. Failures are reported with a stack trace.
     */
    public void testWord() {
        try {
            FileInputStream in = new FileInputStream("D:\\sinye.doc"); // load the document
            try {
                POIFSFileSystem pfs = new POIFSFileSystem(in);
                HWPFDocument hwpf = new HWPFDocument(pfs);
                Range range = hwpf.getRange(); // readable range of the document
                TableIterator it = new TableIterator(range);
                // Iterate over the tables in the document
                while (it.hasNext()) {
                    Table tb = it.next();
                    // Iterate over rows, starting at 0
                    for (int i = 0; i < tb.numRows(); i++) {
                        TableRow tr = tb.getRow(i);
                        // Iterate over cells, starting at 0
                        for (int j = 0; j < tr.numCells(); j++) {
                            TableCell td = tr.getCell(j);
                            // Print each paragraph of the cell
                            for (int k = 0; k < td.numParagraphs(); k++) {
                                Paragraph para = td.getParagraph(k);
                                System.out.println(para.text());
                            }
                        }
                    }
                }
            } finally {
                // Release the file handle even if parsing fails.
                in.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the text of D:\sinye.doc twice: once through the Word 2003
     * extractor and once through the Word 2007 extractor. Failures are
     * reported with a stack trace.
     */
    public void testWord1() {
        try {
            // Word 2003: pictures are not read
            InputStream is = new FileInputStream(new File("D:\\sinye.doc"));
            try {
                WordExtractor ex = new WordExtractor(is);
                String text2003 = ex.getText();
                System.out.println(text2003);
            } finally {
                is.close();
            }

            // Word 2007: pictures are not read; table data is placed at the end of the string
            // NOTE(review): the same .doc path is passed to the OOXML extractor -- confirm whether a .docx file was intended.
            OPCPackage opcPackage = POIXMLDocument.openPackage("D:\\sinye.doc");
            try {
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                String text2007 = extractor.getText();
                System.out.println(text2007);
            } finally {
                // Close the package without saving; it was opened only for reading.
                opcPackage.revert();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

 



import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

import java.io.File;   
import java.io.FileInputStream;   
import java.io.InputStream;   
  
import org.apache.poi.POIXMLDocument;   
import org.apache.poi.POIXMLTextExtractor;   
import org.apache.poi.hwpf.extractor.WordExtractor;   
import org.apache.poi.openxml4j.opc.OPCPackage;   
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;  


import org.apache.poi.poifs.filesystem.POIFSFileSystem;

public class ExportDocImpl {

    /**
     * Prints the text of every paragraph in every table cell of
     * D:\sinye.doc to standard output. Failures are reported with a stack trace.
     */
    public void testWord() {
        try {
            FileInputStream in = new FileInputStream("D:\\sinye.doc"); // load the document
            try {
                POIFSFileSystem pfs = new POIFSFileSystem(in);
                HWPFDocument hwpf = new HWPFDocument(pfs);
                Range range = hwpf.getRange(); // readable range of the document
                TableIterator it = new TableIterator(range);
                // Iterate over the tables in the document
                while (it.hasNext()) {
                    Table tb = it.next();
                    // Iterate over rows, starting at 0
                    for (int i = 0; i < tb.numRows(); i++) {
                        TableRow tr = tb.getRow(i);
                        // Iterate over cells, starting at 0
                        for (int j = 0; j < tr.numCells(); j++) {
                            TableCell td = tr.getCell(j);
                            // Print each paragraph of the cell
                            for (int k = 0; k < td.numParagraphs(); k++) {
                                Paragraph para = td.getParagraph(k);
                                System.out.println(para.text());
                            }
                        }
                    }
                }
            } finally {
                // Release the file handle even if parsing fails.
                in.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the text of D:\sinye.doc twice: once through the Word 2003
     * extractor and once through the Word 2007 extractor. Failures are
     * reported with a stack trace.
     */
    public void testWord1() {
        try {
            // Word 2003: pictures are not read
            InputStream is = new FileInputStream(new File("D:\\sinye.doc"));
            try {
                WordExtractor ex = new WordExtractor(is);
                String text2003 = ex.getText();
                System.out.println(text2003);
            } finally {
                is.close();
            }

            // Word 2007: pictures are not read; table data is placed at the end of the string
            // NOTE(review): the same .doc path is passed to the OOXML extractor -- confirm whether a .docx file was intended.
            OPCPackage opcPackage = POIXMLDocument.openPackage("D:\\sinye.doc");
            try {
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                String text2007 = extractor.getText();
                System.out.println(text2007);
            } finally {
                // Close the package without saving; it was opened only for reading.
                opcPackage.revert();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}



 

 

 



/**
 * 利用POI实现从word中读取内容
 */
============================================
将读取的内容写入另外一个文件中
package com.cy;
/**
 * WordReader类中readDoc的作用为从word中将数据读出
 */import com.cy.WordWriter;
import java.io.ByteArrayInputStream;
import java.io.FileOutputStream;
import java.io.IOException;import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * Writes a string into a new POIFS (OLE2) container file.
 */
public class WordWriter {

    /**
     * Stores the bytes of {@code content} in a "WordDocument" entry of a new
     * POIFS file system and writes that file system to {@code path}.
     *
     * @param path    destination file path
     * @param content text whose bytes become the "WordDocument" entry
     * @return {@code true} if the file was written; {@code false} if an
     *         argument is {@code null} or an {@link IOException} occurred
     */
    public static boolean writeDoc(String path, String content) {
        if (path == null || content == null) {
            return false;
        }
        try {
            // Encoded with the platform default charset, as before.
            // NOTE(review): an explicit charset may be preferable -- confirm what readers of the file expect.
            byte[] bytes = content.getBytes();
            POIFSFileSystem fs = new POIFSFileSystem();
            DirectoryEntry directory = fs.getRoot();
            directory.createDocument("WordDocument", new ByteArrayInputStream(bytes));

            FileOutputStream ostream = new FileOutputStream(path);
            try {
                fs.writeFilesystem(ostream);
            } finally {
                // Always release the file handle, even when writing fails.
                ostream.close();
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Reads the text of D://test.doc via {@code WordReader.readDoc} and writes
     * it to D://result.doc.
     *
     * @param args unused
     * @throws Exception if reading the input document fails
     */
    public static void main(String[] args) throws Exception {
        String wr = WordReader.readDoc("D://test.doc");
        writeDoc("D://result.doc", wr);
    }
}
 // NOTE: this program currently handles only plain text; tables and styling are not supported yet. Improvements are planned, so stay tuned.
import java.io.File;
import java.io.FileInputStream;import org.apache.poi.hwpf.extractor.WordExtractor;
public class WordReader {

    /**
     * Extracts the text of a DOC file using {@link WordExtractor}.
     *
     * @param doc path of the DOC file to read
     * @return the text returned by {@code WordExtractor.getText()}
     * @throws Exception if the file cannot be opened or the extractor fails
     */
    public static String readDoc(String doc) throws Exception {
        // Open an input stream on the DOC file
        FileInputStream in = new FileInputStream(new File(doc));
        try {
            // Build the extractor and pull the text out of the document
            WordExtractor extractor = new WordExtractor(in);
            return extractor.getText();
        } finally {
            // Release the file handle whether or not extraction succeeded.
            in.close();
        }
    }

    /**
     * Prints the text of d://test.doc to standard output; any failure is
     * reported with a stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String text = WordReader.readDoc("d://test.doc");
            System.out.println(text);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

摘自:

 

import java.io.File;
 import java.io.FileInputStream; 
import java.io.FileNotFoundException;
 
import org.apache.poi.hwpf.HWPFDocument; 
import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableIterator; 
import org.apache.poi.hwpf.usermodel.TableRow;
 
import java.io.File; 
import java.io.FileInputStream;   
import java.io.InputStream;    
 import org.apache.poi.POIXMLDocument;  
 import org.apache.poi.POIXMLTextExtractor;   
import org.apache.poi.hwpf.extractor.WordExtractor;   
import org.apache.poi.openxml4j.opc.OPCPackage;   
import org.apache.poi.xwpf.extractor.XWPFWordExtractor; 
 
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
public class ExportDocImpl {

    /**
     * Prints the text of every paragraph in every table cell of
     * D:\sinye.doc to standard output. Failures are reported with a stack trace.
     */
    public void testWord() {
        try {
            FileInputStream in = new FileInputStream("D:\\sinye.doc"); // load the document
            try {
                POIFSFileSystem pfs = new POIFSFileSystem(in);
                HWPFDocument hwpf = new HWPFDocument(pfs);
                Range range = hwpf.getRange(); // readable range of the document
                TableIterator it = new TableIterator(range);
                // Iterate over the tables in the document
                while (it.hasNext()) {
                    Table tb = it.next();
                    // Iterate over rows, starting at 0
                    for (int i = 0; i < tb.numRows(); i++) {
                        TableRow tr = tb.getRow(i);
                        // Iterate over cells, starting at 0
                        for (int j = 0; j < tr.numCells(); j++) {
                            TableCell td = tr.getCell(j);
                            // Print each paragraph of the cell
                            for (int k = 0; k < td.numParagraphs(); k++) {
                                Paragraph para = td.getParagraph(k);
                                System.out.println(para.text());
                            }
                        }
                    }
                }
            } finally {
                // Release the file handle even if parsing fails.
                in.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the text of D:\sinye.doc twice: once through the Word 2003
     * extractor and once through the Word 2007 extractor. Failures are
     * reported with a stack trace.
     */
    public void testWord1() {
        try {
            // Word 2003: pictures are not read
            InputStream is = new FileInputStream(new File("D:\\sinye.doc"));
            try {
                WordExtractor ex = new WordExtractor(is);
                String text2003 = ex.getText();
                System.out.println(text2003);
            } finally {
                is.close();
            }

            // Word 2007: pictures are not read; table data is placed at the end of the string
            // NOTE(review): the same .doc path is passed to the OOXML extractor -- confirm whether a .docx file was intended.
            OPCPackage opcPackage = POIXMLDocument.openPackage("D:\\sinye.doc");
            try {
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                String text2007 = extractor.getText();
                System.out.println(text2007);
            } finally {
                // Close the package without saving; it was opened only for reading.
                opcPackage.revert();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}



import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

import java.io.File;   
import java.io.FileInputStream;   
import java.io.InputStream;   
  
import org.apache.poi.POIXMLDocument;   
import org.apache.poi.POIXMLTextExtractor;   
import org.apache.poi.hwpf.extractor.WordExtractor;   
import org.apache.poi.openxml4j.opc.OPCPackage;   
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;  


import org.apache.poi.poifs.filesystem.POIFSFileSystem;

public class ExportDocImpl {

    /**
     * Prints the text of every paragraph in every table cell of
     * D:\sinye.doc to standard output. Failures are reported with a stack trace.
     */
    public void testWord() {
        try {
            FileInputStream in = new FileInputStream("D:\\sinye.doc"); // load the document
            try {
                POIFSFileSystem pfs = new POIFSFileSystem(in);
                HWPFDocument hwpf = new HWPFDocument(pfs);
                Range range = hwpf.getRange(); // readable range of the document
                TableIterator it = new TableIterator(range);
                // Iterate over the tables in the document
                while (it.hasNext()) {
                    Table tb = it.next();
                    // Iterate over rows, starting at 0
                    for (int i = 0; i < tb.numRows(); i++) {
                        TableRow tr = tb.getRow(i);
                        // Iterate over cells, starting at 0
                        for (int j = 0; j < tr.numCells(); j++) {
                            TableCell td = tr.getCell(j);
                            // Print each paragraph of the cell
                            for (int k = 0; k < td.numParagraphs(); k++) {
                                Paragraph para = td.getParagraph(k);
                                System.out.println(para.text());
                            }
                        }
                    }
                }
            } finally {
                // Release the file handle even if parsing fails.
                in.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the text of D:\sinye.doc twice: once through the Word 2003
     * extractor and once through the Word 2007 extractor. Failures are
     * reported with a stack trace.
     */
    public void testWord1() {
        try {
            // Word 2003: pictures are not read
            InputStream is = new FileInputStream(new File("D:\\sinye.doc"));
            try {
                WordExtractor ex = new WordExtractor(is);
                String text2003 = ex.getText();
                System.out.println(text2003);
            } finally {
                is.close();
            }

            // Word 2007: pictures are not read; table data is placed at the end of the string
            // NOTE(review): the same .doc path is passed to the OOXML extractor -- confirm whether a .docx file was intended.
            OPCPackage opcPackage = POIXMLDocument.openPackage("D:\\sinye.doc");
            try {
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                String text2007 = extractor.getText();
                System.out.println(text2007);
            } finally {
                // Close the package without saving; it was opened only for reading.
                opcPackage.revert();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}