Convert PDF to CSV

PDFBox – Convert PDF to CSV

This article shows you how to use Apache PDFBox to convert a PDF file to a CSV file in Java.

1. Get PDFBox

pom.xml
<dependency>
 <groupId>org.apache.pdfbox</groupId>
 <artifactId>pdfbox</artifactId>
 <version>2.0.6</version>
 </dependency>

2. PDF example

PDF sample data
pdf-data.png

3. Convert PDF to TXT file

PDF_TXT.java


package com.dzone.pdf_txt;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Scanner;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Extracts the text of a PDF document into a {@code .txt} file that is written
 * next to the input file, using Apache PDFBox.
 *
 * <p>NOTE(review): this listing imports {@code org.apache.pdfbox.util.PDFTextStripper},
 * which is the PDFBox 1.8.x location; PDFBox 2.0.x moved the class to
 * {@code org.apache.pdfbox.text}. Confirm the import against the PDFBox version
 * declared in the build.
 */
public class PDF_TXT {

	/** PDF that is converted when no path is passed on the command line. */
	private static final String DEFAULT_INPUT = "C:\\dzone\\Hand Book.pdf";

	/**
	 * Converts one PDF file to text and prints the input and output paths.
	 *
	 * @param args optional; {@code args[0]} is the path of the PDF to convert,
	 *             otherwise {@link #DEFAULT_INPUT} is used
	 */
	public static void main(String[] args) {
		File inputFile = new File(args.length > 0 ? args[0] : DEFAULT_INPUT);
		if (!inputFile.exists()) {
			System.out.println("File does not exist:" + inputFile.getAbsolutePath());
			return;
		}

		System.out.println("Input File:" + inputFile.getAbsolutePath());
		PDF_TXT converter = new PDF_TXT();

		File txtFile = converter.convertPDFtoText(inputFile);
		// Only evaluated when the JVM runs with -ea.
		assert txtFile.exists();
		System.out.println("Text... File:" + txtFile.getAbsolutePath());
	}

	/**
	 * Writes the text content of {@code inputFile} to a file with the same base
	 * name and a {@code .txt} suffix in the same directory.
	 *
	 * @param inputFile the PDF document to read
	 * @return the text file that was written
	 * @throws RuntimeException wrapping the {@link IOException} if the PDF cannot
	 *                          be read or the text file cannot be written
	 */
	public File convertPDFtoText(File inputFile) {
		String newFileName = replaceSuffix(inputFile.getName(), ".txt");
		File outFile = new File(inputFile.getAbsoluteFile().getParent(), newFileName);
		// Both resources must be closed: an unclosed writer may never flush its
		// buffer to disk, and an unclosed document keeps the PDF open.
		try (PDDocument document = PDDocument.load(inputFile);
				FileWriter writer = new FileWriter(outFile)) {
			PDFTextStripper stripper = new PDFTextStripper();
			stripper.writeText(document, writer);
			return outFile;
		} catch (IOException e) {
			throw new FailedException(e);
		}
	}

	/** Unchecked wrapper for I/O failures raised during conversion. */
	private static class FailedException extends RuntimeException {
		private static final long serialVersionUID = 2L;

		public FailedException(Exception e) {
			super(e);
		}
	}

	/**
	 * Replaces everything from the last {@code '.'} of {@code fileName} onwards
	 * with {@code suffix}; if the name contains no dot the suffix is appended.
	 *
	 * @param fileName the file name to transform (no directory part expected)
	 * @param suffix   the new suffix, including its leading dot, e.g. {@code ".txt"}
	 * @return the file name carrying the new suffix
	 */
	public static String replaceSuffix(String fileName, String suffix) {
		int lastDot = fileName.lastIndexOf('.');
		String baseName = (lastDot == -1) ? fileName : fileName.substring(0, lastDot);
		return baseName + suffix;
	}
}

Output



Input File:C:\dzone\Hand Book.pdf
Text... File:C:\dzone\Hand Book.txt

pdf-txt.png

 

4. Output in txt file

PDF to TXT

5. Convert TXT to CSV file

TXT_CSV.java


package com.dzone.pdf_csv;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Scanner;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

import.com.mkyong.pdf_csv.PDF_TXT;

/**
 * Converts a PDF document to text with Apache PDFBox and then filters the text
 * into a delimited file: every line whose last ten whitespace-separated tokens
 * look like a quote row is written as one record, fields joined by {@code '|'}.
 *
 * <p>NOTE(review): this listing imports {@code org.apache.pdfbox.util.PDFTextStripper},
 * which is the PDFBox 1.8.x location; PDFBox 2.0.x moved the class to
 * {@code org.apache.pdfbox.text}. Confirm the import against the PDFBox version
 * declared in the build.
 */
public class TXT_CSV {

	/** PDF that is converted when no path is passed on the command line. */
	private static final String DEFAULT_INPUT = "C:\\dzone\\Hand Book.pdf";

	/** Number of trailing tokens of a text line that form one output record. */
	private static final int COLUMN_COUNT = 10;

	/** A pipe followed by an optional sign and an optional (decimal) number. */
	private static final String NUMBER_FIELD = "\\|(\\-)?(\\d+(\\.\\d+)?)?";

	/** Like {@link #NUMBER_FIELD}, but the number may be wrapped in parentheses. */
	private static final String LAST_FIELD = "\\|(\\-)?(\\(?\\d+(\\.\\d+)?\\)?)?";

	/** Compiled once; it is matched against every candidate record. */
	private static final Pattern QUOTE_LINE = Pattern.compile(buildQuoteLineRegex());

	/**
	 * Converts one PDF file to text and then to the delimited file, printing
	 * the path of every file involved.
	 *
	 * @param args optional; {@code args[0]} is the path of the PDF to convert,
	 *             otherwise {@link #DEFAULT_INPUT} is used
	 */
	public static void main(String[] args) {
		File inputFile = new File(args.length > 0 ? args[0] : DEFAULT_INPUT);
		if (!inputFile.exists()) {
			System.out.println("File does not exist:" + inputFile.getAbsolutePath());
			return;
		}

		System.out.println("Input File:" + inputFile.getAbsolutePath());
		TXT_CSV converter = new TXT_CSV();

		File txtFile = converter.convertPDFtoText(inputFile);
		// Only evaluated when the JVM runs with -ea.
		assert txtFile.exists();
		System.out.println("Text... File:" + txtFile.getAbsolutePath());

		File csvFile = converter.convertTextToCSV(txtFile);
		assert csvFile.exists();
		System.out.println("CSV... File:" + csvFile.getAbsolutePath());
	}

	/**
	 * Writes the text content of {@code inputFile} to a file with the same base
	 * name and a {@code .txt} suffix in the same directory.
	 *
	 * @param inputFile the PDF document to read
	 * @return the text file that was written
	 * @throws RuntimeException wrapping the {@link IOException} if the PDF cannot
	 *                          be read or the text file cannot be written
	 */
	public File convertPDFtoText(File inputFile) {
		String newFileName = replaceSuffix(inputFile.getName(), ".txt");
		File outFile = new File(inputFile.getAbsoluteFile().getParent(), newFileName);
		// Both resources must be closed: an unclosed writer may never flush its
		// buffer to disk, and an unclosed document keeps the PDF open.
		try (PDDocument document = PDDocument.load(inputFile);
				FileWriter writer = new FileWriter(outFile)) {
			PDFTextStripper stripper = new PDFTextStripper();
			stripper.writeText(document, writer);
			return outFile;
		} catch (IOException e) {
			throw new FailedException(e);
		}
	}

	/**
	 * Reads {@code inputFile} line by line and writes every line accepted by
	 * {@link #convertLineToCSV(String)} to a file with the same base name and a
	 * {@code .csv} suffix in the same directory.
	 *
	 * @param inputFile the text file to read
	 * @return the delimited file that was written
	 * @throws RuntimeException wrapping the {@link IOException} if the input
	 *                          cannot be read or the output cannot be written
	 */
	public File convertTextToCSV(File inputFile) {
		String newFileName = replaceSuffix(inputFile.getName(), ".csv");
		File outFile = new File(inputFile.getAbsoluteFile().getParent(), newFileName);
		try (Scanner scanner = new Scanner(inputFile);
				PrintWriter writer = new PrintWriter(outFile)) {
			while (scanner.hasNextLine()) {
				String record = convertLineToCSV(scanner.nextLine());
				if (record != null) {
					writer.println(record);
				}
			}
			// Scanner and PrintWriter both swallow I/O errors; surface them here
			// instead of returning a silently truncated file.
			IOException readError = scanner.ioException();
			if (readError != null) {
				throw readError;
			}
			if (writer.checkError()) {
				throw new IOException("Failed writing " + outFile.getAbsolutePath());
			}
			return outFile;
		} catch (IOException e) {
			throw new FailedException(e);
		}
	}

	/** Unchecked wrapper for I/O failures raised during conversion. */
	private static class FailedException extends RuntimeException {
		private static final long serialVersionUID = 2L;

		public FailedException(Exception e) {
			super(e);
		}
	}

	/**
	 * Builds a record from the last {@link #COLUMN_COUNT} whitespace-separated
	 * tokens of {@code line}: commas are stripped from each token and the tokens
	 * are joined with {@code '|'}.
	 *
	 * @param line one line of extracted text
	 * @return the record, or {@code null} if the line has too few tokens or the
	 *         record does not have the shape of a quote row
	 */
	private String convertLineToCSV(String line) {
		String[] fields = line.split("\\s+");
		if (fields.length < COLUMN_COUNT) {
			return null;
		}
		int first = fields.length - COLUMN_COUNT;
		StringBuilder builder = new StringBuilder();
		for (int i = first; i < fields.length; i++) {
			if (i > first) {
				builder.append('|');
			}
			builder.append(fields[i].replace(",", ""));
		}
		String result = builder.toString();
		return isQuoteLine(result) ? result : null;
	}

	/**
	 * Tells whether {@code line} is an upper-case alphanumeric symbol followed
	 * by nine pipe-separated, optionally signed, optionally empty numeric fields
	 * (the last of which may be parenthesised).
	 */
	private static boolean isQuoteLine(String line) {
		return QUOTE_LINE.matcher(line).matches();
	}

	/** Assembles the quote-row regex: symbol, eight plain numeric fields, one last field. */
	private static String buildQuoteLineRegex() {
		StringBuilder regex = new StringBuilder("[A-Z0-9]+");
		for (int i = 0; i < COLUMN_COUNT - 2; i++) {
			regex.append(NUMBER_FIELD);
		}
		return regex.append(LAST_FIELD).toString();
	}

	/**
	 * Replaces everything from the last {@code '.'} of {@code fileName} onwards
	 * with {@code suffix}; if the name contains no dot the suffix is appended.
	 *
	 * @param fileName the file name to transform (no directory part expected)
	 * @param suffix   the new suffix, including its leading dot, e.g. {@code ".csv"}
	 * @return the file name carrying the new suffix
	 */
	private static String replaceSuffix(String fileName, String suffix) {
		int lastDot = fileName.lastIndexOf('.');
		String baseName = (lastDot == -1) ? fileName : fileName.substring(0, lastDot);
		return baseName + suffix;
	}
}

Output



Input File:C:\dzone\Hand Book.pdf
Text... File:C:\dzone\Hand Book.txt
CSV... File:C:\dzone\Hand Book.csv

txt-csv.png

6. Output in CSV file

TXT to CSV

References

  1. Apache PDFBox

 

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Blog at WordPress.com.

Up ↑

%d bloggers like this: