利用pdfbox和poi抽取pdf、doc以及docx格式的内容

使用pdfbox1.5.0抽取pdf格式文档内容，使用poi3.7抽取doc及docx文档内容：
 /**

  * Created by yan.shi on 2017/9/25.

  */

 import org.apache.pdfbox.pdfparser.PDFParser;

 import org.apache.pdfbox.pdmodel.PDDocument;

 import org.apache.pdfbox.util.PDFTextStripper;

 import org.apache.poi.POIXMLDocument;

 import org.apache.poi.POIXMLTextExtractor;

 import org.apache.poi.hwpf.extractor.WordExtractor;

 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;

 import org.apache.poi.openxml4j.opc.OPCPackage;

 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

 import org.apache.xmlbeans.XmlException;

 import java.io.File;

 import java.io.FileInputStream;

 import java.io.IOException;

 /**
  * Extracts plain text from PDF, DOC and DOCX files.
  *
  * <p>PDF files are parsed with PDFBox; DOC and DOCX files are parsed with Apache POI.
  * All extraction methods are best-effort: an I/O or parse failure is reported on
  * standard error via {@code printStackTrace()} and results in a {@code null} return.
  */
 public class ExtractText {

     /**
      * Command-line entry point: extracts a file and prints its content.
      *
      * @param args optional; {@code args[0]} is the path of the file to extract.
      *             When absent, the original placeholder path is used.
      */
     public static void main(String[] args) {
         ExtractText text = new ExtractText();
         String filePath = (args != null && args.length > 0) ? args[0] : "文件";
         String content = text.getText(filePath);
         if (null != content) {
             System.out.println("content: " + content);
         }
     }

     /** Creates an extractor; the instance holds no state. */
     public ExtractText() {
     }

     /**
      * Creates an extractor. The argument is accepted for compatibility only and is
      * ignored; pass the path to {@link #getText(String)} instead.
      *
      * @param filePath ignored
      */
     public ExtractText(String filePath) {
     }

     /**
      * Extracts the text of a document, choosing the parser by file extension
      * (case-insensitive). Only {@code pdf}, {@code doc} and {@code docx} are supported.
      *
      * @param filePath path of the document to read
      * @return the extracted text, or {@code null} when the path is {@code null}, the
      *         extension is unsupported, extraction failed, or the document has no text
      */
     public String getText(String filePath) {
         if (filePath == null) {
             // Same "no content" signal as every other failure path, instead of an NPE.
             return null;
         }
         File file = new File(filePath);
         String fileName = file.getName();
         // With no '.', lastIndexOf is -1 and the whole name is used; it then simply
         // fails to match any supported extension.
         String postfix = fileName.substring(fileName.lastIndexOf(".") + 1);
         String content;
         if (postfix.equalsIgnoreCase("pdf")) {
             content = getPDFText(file);
         } else if (postfix.equalsIgnoreCase("doc")) {
             content = getDocText(file);
         } else if (postfix.equalsIgnoreCase("docx")) {
             content = getDocxText(filePath);
         } else {
             System.out.println("输入的文件格式不支持！");
             return null;
         }
         if (null != content && !"".equals(content)) {
             return content;
         }
         return null;
     }

     /**
      * Extracts the text of a PDF file with PDFBox.
      *
      * @param file the PDF file
      * @return the document text, or {@code null} if reading or parsing failed
      */
     private String getPDFText(File file) {
         FileInputStream fileinput = null;
         PDDocument pdfdocument = null;
         String text = null;
         try {
             fileinput = new FileInputStream(file);
             PDFParser parser = new PDFParser(fileinput);
             parser.parse();
             pdfdocument = parser.getPDDocument();
             PDFTextStripper stripper = new PDFTextStripper();
             text = stripper.getText(pdfdocument);
         } catch (IOException e) {
             e.printStackTrace();
         } finally {
             // The parsed document holds its own resources and must be closed
             // separately from the input stream.
             if (pdfdocument != null) {
                 try {
                     pdfdocument.close();
                 } catch (IOException e) {
                     e.printStackTrace();
                 }
             }
             closeQuietly(fileinput);
         }
         return text;
     }

     /**
      * Extracts the text of a legacy Word ({@code .doc}) file with POI.
      *
      * <p>Each paragraph is trimmed, blank paragraphs are dropped, and the remaining
      * paragraphs are joined with a single {@code '\n'}.
      *
      * @param file the DOC file
      * @return the joined paragraph text (possibly empty), or {@code null} if reading
      *         failed
      */
     private String getDocText(File file) {
         FileInputStream fileinput = null;
         String text = null;
         try {
             fileinput = new FileInputStream(file);
             WordExtractor we = new WordExtractor(fileinput);
             String[] paragraphs = we.getParagraphText();
             StringBuilder joined = new StringBuilder();
             for (String paragraph : paragraphs) {
                 // Check for null before trimming, not after.
                 if (paragraph == null) {
                     continue;
                 }
                 String trimmed = paragraph.trim();
                 if (trimmed.equals("")) {
                     continue;
                 }
                 if (joined.length() > 0) {
                     joined.append('\n');
                 }
                 joined.append(trimmed);
             }
             text = joined.toString();
         } catch (IOException e) {
             e.printStackTrace();
         } finally {
             closeQuietly(fileinput);
         }
         return text;
     }

     /**
      * Extracts the text of a Word 2007+ ({@code .docx}) file with POI.
      *
      * @param file path of the DOCX file
      * @return the document text, or {@code null} if opening or parsing failed
      */
     private String getDocxText(String file) {
         OPCPackage opcPackage = null;
         String text = null;
         try {
             opcPackage = POIXMLDocument.openPackage(file);
             POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
             text = extractor.getText();
         } catch (IOException e) {
             e.printStackTrace();
         } catch (XmlException e) {
             e.printStackTrace();
         } catch (OpenXML4JException e) {
             e.printStackTrace();
         } finally {
             if (opcPackage != null) {
                 // revert() releases the file without saving. close() is avoided
                 // because the package is opened read-write and close() would write
                 // it back to the source file.
                 opcPackage.revert();
             }
         }
         return text;
     }

     /** Closes the stream if it was opened, reporting (not propagating) any failure. */
     private static void closeQuietly(FileInputStream input) {
         if (input != null) {
             try {
                 input.close();
             } catch (IOException e) {
                 e.printStackTrace();
             }
         }
     }

 }