1
Vote

Built-in PDF text extractor is terrible

description

I found that the built-in iTextSharp PDF processor quite simply does not work on most PDF documents.

So I searched and found this: http://www.squarepdf.net/pdfbox-in-net-download/

I find that this pdf processor is much better at text extraction than iTextSharp.

I'm sure not all of this is necessary, but this is how I set it up in my project:
  1. I included all the DLLs in my C# project (in a subdirectory, to keep things somewhat clean).
  2. I added references to the pdfbox, IKVM.OpenJDK.Core, and IKVM.OpenJDK.SwingAWT DLLs.
  3. I created the following pipeline step to run it:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using NCrawler.Interfaces;
using System.IO;
using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.util;

namespace PDFTextExtraction
{
    /// <summary>
    /// Pipeline step that loads a PDF response with PDFBox and stores the
    /// document title and extracted text on the property bag.
    /// </summary>
    public class PDFPipelineStep : IPipelineStep
    {
        #region IPipelineStep Members

        /// <summary>
        /// If the response content type starts with "application/pdf", loads the
        /// response stream as a PDF and sets <c>propertyBag.Title</c> and
        /// <c>propertyBag.Text</c> from it. Any other content type is ignored.
        /// </summary>
        /// <param name="crawler">The crawler invoking this step; not used here.</param>
        /// <param name="propertyBag">Supplies the content type and response stream, and receives the title and text.</param>
        /// <remarks>
        /// Extraction is best-effort: exceptions raised while loading or reading the
        /// PDF are written to the console and not rethrown.
        /// </remarks>
        public void Process(NCrawler.Crawler crawler, NCrawler.PropertyBag propertyBag)
        {
            // Guard against a missing content type instead of failing with a
            // NullReferenceException on StartsWith.
            string contentType = propertyBag.ContentType;
            if (contentType == null ||
                !contentType.StartsWith("application/pdf", StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            ikvm.io.InputStreamWrapper inputStream = null;
            PDDocument doc = null;
            try
            {
                using (Stream input = propertyBag.GetResponse())
                {
                    // Wrap the .NET stream so the Java-based PDFBox API can read it.
                    inputStream = new ikvm.io.InputStreamWrapper(input);
                    doc = PDDocument.load(inputStream);

                    PDDocumentInformation info = doc.getDocumentInformation();
                    propertyBag.Title = info.getTitle();

                    PDFTextStripper stripper = new PDFTextStripper();
                    propertyBag.Text = stripper.getText(doc);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.ToString());
            }
            finally
            {
                // Close each resource in its own try block so that a failure while
                // closing the document cannot prevent the stream wrapper from closing.
                if (doc != null)
                {
                    try
                    {
                        doc.close();
                    }
                    catch (Exception ex)
                    {
                        // Cleanup stays best-effort, but the failure is reported
                        // the same way as extraction errors rather than dropped.
                        Console.WriteLine(ex.ToString());
                    }
                }

                if (inputStream != null)
                {
                    try
                    {
                        inputStream.close();
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex.ToString());
                    }
                }
            }
        }

        #endregion
    }
}

comments