如何利用pdfbox将pdf解析为txt

秒速五厘米 2022-06-08 02:08 330阅读 0赞

利用apache的pdfbox将pdf解析为txt文件,需要的最基本包如下:

pdfbox-2.0.7.jar(下面的代码使用 org.apache.pdfbox 包,需要 2.x 版本的 pdfbox,并与 fontbox 版本保持一致;0.7.3 版本的包名是 org.pdfbox,无法编译本文代码)

fontbox-2.0.7.jar

commons-logging-1.2.jar

点我下载:点击打开下载链接

下面以 d 盘 pdf 目录(d:/pdf)下的 pdf 文件为例,给出完整代码:

  1. package com;
  2. import java.io.BufferedReader;
  3. import java.io.File;
  4. import java.io.FileInputStream;
  5. import java.io.FileNotFoundException;
  6. import java.io.FileWriter;
  7. import java.io.IOException;
  8. import java.io.InputStreamReader;
  9. import java.sql.SQLException;
  10. import org.apache.pdfbox.io.RandomAccessBuffer;
  11. import org.apache.pdfbox.pdfparser.PDFParser;
  12. import org.apache.pdfbox.pdmodel.PDDocument;
  13. import org.apache.pdfbox.text.PDFTextStripper;
  14. public class CsrPdfRead {
  15. /**
  16. *读取文本文件解析文本
  17. * @param args
  18. * @throws InterruptedException
  19. * @throws SQLException
  20. */
  21. public static void main(String[] args) throws InterruptedException, SQLException {
  22. String path="d:/pdf";
  23. String filepath="";
  24. File file=new File(path);
  25. File[] tempList = file.listFiles();
  26. System.out.println("该目录下对象个数:"+tempList.length);
  27. for (int i = 0; i < tempList.length; i++) {
  28. if (tempList[i].isFile()) {
  29. filepath=path+"/"+tempList[i].getName();
  30. System.out.println(path+"/"+filepath);
  31. if(!filepath.toUpperCase().endsWith(".TXT")){
  32. System.out.println(getOrderText(filepath));
  33. }
  34. }
  35. if (tempList[i].isDirectory()) {
  36. System.out.println("文件夹:"+tempList[i]);
  37. }
  38. }
  39. }
  40. public static String getOrderText(String filepath){
  41. String filetxt="";
  42. try {
  43. String encoding="GBK";
  44. String newfilepath= getTextFromPDF(filepath);
  45. if(newfilepath==null){
  46. return "";
  47. }
  48. File file=new File(newfilepath);
  49. if(file.isFile() && file.exists()){ //判断文件是否存在
  50. InputStreamReader read = new InputStreamReader(
  51. new FileInputStream(file),encoding);//考虑到编码格式
  52. BufferedReader bufferedReader = new BufferedReader(read);
  53. String lineTxt = null;
  54. int a = 0;
  55. int b = 0;
  56. String bb = "";
  57. while((lineTxt = bufferedReader.readLine()) != null){
  58. //System.out.println(lineTxt);
  59. if(lineTxt==null || lineTxt.equals("")||lineTxt.trim().length()==0){
  60. continue;
  61. }
  62. if(lineTxt.contains("Order ID")){
  63. String[] oid = lineTxt.split("\\u0029\\s*P");
  64. if(oid.length>0){
  65. bb+=oid[0].substring(oid[0].lastIndexOf("(")+10);
  66. b++;
  67. }
  68. }
  69. if(lineTxt.contains("-001")){
  70. if(lineTxt.lastIndexOf("-001")-11>=0){
  71. bb+=" "+lineTxt.substring(lineTxt.lastIndexOf("-001")-11,lineTxt.lastIndexOf("-001")+4);
  72. b++;
  73. }
  74. }
  75. if(lineTxt.contains("end of line")){
  76. // System.out.println(++a);
  77. if(b==2){
  78. System.out.println(bb);
  79. //filetxt+=++a+":"+bb+"/r/n";
  80. bb="";
  81. b=0;
  82. }
  83. }
  84. }
  85. read.close();
  86. }else{
  87. System.out.println("找不到指定的文件");
  88. }
  89. } catch (Exception e) {
  90. System.out.println("读取文件内容出错");
  91. e.printStackTrace();
  92. }
  93. return filetxt;
  94. }
  95. public static String getTextFromPDF(String pdfFilePath) {
  96. String result = null;
  97. FileInputStream is = null;
  98. PDDocument document = null;
  99. try {
  100. if(pdfFilePath.toUpperCase().endsWith(".TXT")){
  101. return null;
  102. }
  103. is = new FileInputStream(pdfFilePath);
  104. PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
  105. parser.parse();
  106. document = parser.getPDDocument();
  107. System.out.print(document);
  108. PDFTextStripper stripper = new PDFTextStripper();
  109. System.out.println(pdfFilePath);
  110. result = stripper.getText(document);
  111. FileWriter fw = new FileWriter(pdfFilePath+".txt",false);
  112. fw.write(result);
  113. fw.flush();
  114. fw.close();
  115. } catch (FileNotFoundException e) {
  116. e.printStackTrace();
  117. } catch (IOException e) {
  118. e.printStackTrace();
  119. }catch (Exception e){
  120. e.printStackTrace();
  121. }finally {
  122. if (is != null) {
  123. try {
  124. is.close();
  125. } catch (IOException e) {
  126. e.printStackTrace();
  127. }
  128. }
  129. if (document != null) {
  130. try {
  131. document.close();
  132. } catch (IOException e) {
  133. e.printStackTrace();
  134. }
  135. }
  136. }
  137. return pdfFilePath+".txt";
  138. }
  139. }

结果:

2017091118370674220170911183718289

发表评论

表情:
评论列表 (有 0 条评论,330人围观)

还没有评论,来说两句吧...

相关阅读