Apache PDFBox provides several classes, such as org.apache.pdfbox.text.PDFTextStripper, for reading text from PDF files. This article walks through the steps for reading text from a PDF using Apache PDFBox.
We have a sample PDF that looks like the one below:
Now let's use the PDFTextStripper class to read the text from the above PDF.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
package com.kscodes.examples.pdfbox;

import java.io.FileInputStream;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Minimal example that extracts the text of a PDF file with Apache PDFBox
 * and prints it to standard output.
 */
public class PdfReaderExample {

    /**
     * Loads the sample PDF, extracts its text with {@link PDFTextStripper}
     * and prints a separator line followed by the extracted text.
     *
     * <p>Any {@link IOException} raised while opening, parsing or closing the
     * file is caught and reported on standard output.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String args[]) {
        // try-with-resources closes the document and then the underlying
        // stream even when text extraction throws, so neither handle leaks.
        try (FileInputStream pdfStream = new FileInputStream("K:\\Kscodes\\pdf\\pdf-sample.pdf");
                PDDocument pdDocument = PDDocument.load(pdfStream)) {

            // Initialize the PDFTextStripper object
            PDFTextStripper pdfTextStripper = new PDFTextStripper();

            // Read the pdf using the PDFTextStripper object
            String pdfText = pdfTextStripper.getText(pdDocument);

            // Print the text
            System.out.println("*************");
            System.out.println(pdfText);
        } catch (IOException ioe) {
            System.out.println("Error while reading pdf" + ioe.getMessage());
        }
    }
}
Output