如何利用pdfbox将pdf解析为txt-蒲公英云

如何利用pdfbox将pdf解析为txt

利用apache的pdfbox将pdf解析为txt文件，需要的最基本包如下：

pdfbox-2.0.7.jar（下面的示例代码使用 org.apache.pdfbox.text.PDFTextStripper 等 2.x 版本的包名，需与 fontbox-2.0.7 配套使用）

fontbox-2.0.7.jar

commons-logging-1.2.jar

点我下载：点击打开下载链接

下面以 D 盘 pdf 目录（d:/pdf）下的 PDF 文件为例：

package com;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.sql.SQLException;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Converts every non-".txt" file in a directory to text with PDFBox and prints the
 * order records found in the extracted text.
 *
 * <p>For each input file {@code X} a sibling file {@code X.txt} is written; that text file is
 * then scanned line by line for order markers.
 */
public class CsrPdfRead {
    /** Charset used both to write the extracted text file and to read it back. */
    private static final String TEXT_ENCODING = "GBK";

    /** Regex that cuts an "Order ID" line at a ')' followed by optional whitespace and 'P'. */
    private static final String ORDER_ID_SPLIT_REGEX = "\\u0029\\s*P";

    /** Marker that identifies an order-number line. */
    private static final String ORDER_NO_SUFFIX = "-001";

    /**
     * Walks the directory {@code d:/pdf}, converts each regular file that is not a
     * ".txt" file and prints the order text extracted from it.
     *
     * @param args unused
     * @throws InterruptedException declared for compatibility; not thrown by this code
     * @throws SQLException declared for compatibility; not thrown by this code
     */
    public static void main(String[] args) throws InterruptedException, SQLException {
        String path = "d:/pdf";
        File[] entries = new File(path).listFiles();
        if (entries == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.out.println("目录不存在或无法读取：" + path);
            return;
        }
        System.out.println("该目录下对象个数：" + entries.length);
        for (File entry : entries) {
            if (entry.isFile()) {
                String filepath = path + "/" + entry.getName();
                // filepath already contains the directory, so print it as is.
                System.out.println(filepath);
                if (!isTextFile(filepath)) {
                    System.out.println(getOrderText(filepath));
                }
            }
            if (entry.isDirectory()) {
                System.out.println("文件夹：" + entry);
            }
        }
    }

    /**
     * Extracts the text of the given PDF (via {@link #getTextFromPDF(String)}) and collects
     * the order records found in it.
     *
     * <p>A record is built from the fragment taken from an "Order ID" line and the fragment
     * taken from a "-001" line. When an "end of line" marker is reached and exactly two
     * fragments have been collected, the record is emitted.
     *
     * @param filepath path of the PDF file to process
     * @return the emitted records, each terminated by the platform line separator; an empty
     *         string when the PDF could not be converted, the text file is missing, or no
     *         record was found. If reading fails part-way, the records collected so far
     *         are returned.
     */
    public static String getOrderText(String filepath) {
        StringBuilder orders = new StringBuilder();
        try {
            String txtPath = getTextFromPDF(filepath);
            if (txtPath == null) {
                return "";
            }
            File txtFile = new File(txtPath);
            if (!txtFile.isFile()) {
                System.out.println("找不到指定的文件");
                return "";
            }
            // Both resources are declared separately so the stream is closed even if
            // creating the reader fails (e.g. unsupported charset).
            try (FileInputStream in = new FileInputStream(txtFile);
                    BufferedReader reader =
                            new BufferedReader(new InputStreamReader(in, TEXT_ENCODING))) {
                StringBuilder record = new StringBuilder();
                int fragments = 0;
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.trim().isEmpty()) {
                        continue;
                    }
                    if (line.contains("Order ID") && appendOrderId(line, record)) {
                        fragments++;
                    }
                    if (line.contains(ORDER_NO_SUFFIX) && appendOrderNumber(line, record)) {
                        fragments++;
                    }
                    if (line.contains("end of line") && fragments == 2) {
                        orders.append(record).append(System.lineSeparator());
                        record.setLength(0);
                        fragments = 0;
                    }
                    // NOTE(review): when "end of line" is reached with a fragment count other
                    // than 2, the partial record is carried over to the next one, exactly as
                    // before. Confirm whether it should be discarded instead.
                }
            }
        } catch (Exception e) {
            System.out.println("读取文件内容出错");
            e.printStackTrace();
        }
        return orders.toString();
    }

    /**
     * Parses the given PDF and writes its text to {@code pdfFilePath + ".txt"}.
     *
     * @param pdfFilePath path of the PDF file; paths ending in ".txt" (any case) are ignored
     * @return the path of the written text file, or {@code null} when the input is
     *         {@code null}, is a ".txt" file, or could not be parsed or written
     */
    public static String getTextFromPDF(String pdfFilePath) {
        if (pdfFilePath == null || isTextFile(pdfFilePath)) {
            return null;
        }
        String txtFilePath = pdfFilePath + ".txt";
        PDDocument document = null;
        try (FileInputStream is = new FileInputStream(pdfFilePath)) {
            PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
            parser.parse();
            document = parser.getPDDocument();
            System.out.println(pdfFilePath);
            String text = new PDFTextStripper().getText(document);
            // Write with the same charset getOrderText() reads with, instead of the
            // platform default.
            try (Writer out = new OutputStreamWriter(
                    new FileOutputStream(txtFilePath, false), TEXT_ENCODING)) {
                out.write(text);
            }
            return txtFilePath;
        } catch (IOException | RuntimeException e) {
            // Report the failure and signal it to the caller instead of returning the
            // path of a text file that was never (completely) written.
            e.printStackTrace();
            return null;
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Returns whether the path ends in ".txt", ignoring case. */
    private static boolean isTextFile(String filepath) {
        return filepath.toUpperCase().endsWith(".TXT");
    }

    /**
     * Appends the order-id fragment of an "Order ID" line to {@code record}.
     *
     * <p>The fragment is the part of the line before the first ")P" separator, starting
     * 10 characters after its last '(' (presumably skipping a fixed "(Order ID" style
     * prefix; verify against real input).
     *
     * @return {@code true} if a fragment was appended, {@code false} if the line is too
     *         short to contain one
     */
    private static boolean appendOrderId(String line, StringBuilder record) {
        String[] parts = line.split(ORDER_ID_SPLIT_REGEX);
        if (parts.length == 0) {
            return false;
        }
        String head = parts[0];
        int start = head.lastIndexOf("(") + 10;
        if (start > head.length()) {
            // Too short to hold the expected prefix; skip instead of throwing.
            return false;
        }
        record.append(head.substring(start));
        return true;
    }

    /**
     * Appends a space and the order-number fragment of a "-001" line to {@code record}.
     *
     * <p>The fragment is the last "-001" in the line together with the 11 characters
     * preceding it.
     *
     * @return {@code true} if a fragment was appended, {@code false} if fewer than 11
     *         characters precede the marker
     */
    private static boolean appendOrderNumber(String line, StringBuilder record) {
        int markerAt = line.lastIndexOf(ORDER_NO_SUFFIX);
        int start = markerAt - 11;
        if (start < 0) {
            return false;
        }
        record.append(' ').append(line, start, markerAt + ORDER_NO_SUFFIX.length());
        return true;
    }
}