How do I adjust pages with rotated text so that all text is right side up?

Question:

Sometimes we get PDF documents in which the text is right side up, but other times the text is rotated one way or another, so it appears sideways. The page dimensions for both kinds of document are a normal 8.5x11. Also, there is no rotation setting on the pages, meaning Page.GetRotation always returns e_0.

Furthermore, some documents are mixed, so some pages are rotated clockwise and other pages are rotated counterclockwise.

How would I automatically rotate the pages so all text is rotated right side up?

Answer:

Assuming the following pre-conditions:

  1. That the pages contain actual text, and not images or vector paths that make up the text
  2. That the majority of the text on each page is rotated in the same direction, namely the one that should end up right side up.

Then the following code iterates over all text on a page, counts how many text elements occur at each rotation angle, and then rotates the page by the most common angle. This works regardless of any existing page rotation setting.

//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2014 by PDFTron Systems Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
// A sample project illustrating some extraction capabilities of ElementReader
// in more detail
//---------------------------------------------------------------------------------------

using System;
using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.SDF;
using pdftron.PDF;
using System.Windows;
using System.Windows.Media;
using System.Collections.Generic;
using System.Linq;

namespace ElementReaderAdvTestCS
{
/// <summary>
/// Sample that scans every page of a PDF, counts the orientation of each text
/// element, and sets the page rotation to the most common orientation so that the
/// majority of the text on the page reads right side up.
/// </summary>
class Class1
{
    // NOTE(review): presumably touching PDFNetLoader.Instance() makes sure the native
    // PDFNet library is resolved before any other PDFNet call - confirm against the SDK docs.
    private static pdftron.PDFNetLoader pdfNetLoader = pdftron.PDFNetLoader.Instance();

    /// <summary>
    /// Maximum distance, in degrees, between a measured text angle and the nearest
    /// multiple of 90 for the text to still be counted. Absorbs floating-point noise
    /// from the matrix product and the angle computation.
    /// </summary>
    private const double AngleToleranceDegrees = 0.5;

    /// <summary>
    /// Histogram for the page currently being scanned: key is the text angle in
    /// degrees (0, 90, 180 or 270), value is the number of text elements at that angle.
    /// Initialised eagerly so that <see cref="ProcessText"/> can be called on its own.
    /// </summary>
    private static Dictionary<int, int> angles = new Dictionary<int, int>();

    /// <summary>
    /// Reads elements from <paramref name="page_reader"/> until the end of the current
    /// text object and records the orientation of every text element in the histogram.
    /// </summary>
    /// <param name="page_reader">Reader positioned just after an e_text_begin element.</param>
    public static void ProcessText(ElementReader page_reader)
    {
        Element element;
        while ((element = page_reader.Next()) != null)
        {
            switch (element.GetType())
            {
                case Element.Type.e_text_end:
                    return;

                case Element.Type.e_text:
                {
                    // Combine the graphics-state CTM with the text matrix; only the
                    // linear part (a, b, c, d) matters for orientation, so the
                    // translation components are zeroed.
                    Matrix2D mtx = element.GetCTM() * element.GetTextMatrix();
                    Matrix linear = new Matrix(mtx.m_a, mtx.m_b, mtx.m_c, mtx.m_d, 0, 0);

                    // Measure how far the "up" direction has been turned by the matrix.
                    Vector up = new Vector(0, 1);
                    Vector rotated = Vector.Multiply(up, linear);
                    double angleBetween = Vector.AngleBetween(up, rotated);

                    // Snap to the nearest quarter turn instead of testing
                    // "angle % 90 == 0": the angle is a computed double and is rarely
                    // an exact multiple of 90. A NaN angle fails this comparison and
                    // is therefore skipped.
                    double nearest = Math.Round(angleBetween / 90.0) * 90.0;
                    if (Math.Abs(angleBetween - nearest) <= AngleToleranceDegrees)
                    {
                        // Normalise into [0, 360): -90 becomes 270, -180 becomes 180.
                        int normalized = (((int)nearest % 360) + 360) % 360;

                        // TryGetValue leaves count at 0 when the key is absent.
                        int count;
                        angles.TryGetValue(normalized, out count);
                        angles[normalized] = count + 1;
                    }
                    break;
                }
            }
        }
    }

    /// <summary>
    /// Walks the content of the page opened on <paramref name="reader"/> and hands
    /// every text object to <see cref="ProcessText"/>.
    /// </summary>
    /// <param name="reader">Reader on which Begin has already been called for a page.</param>
    static void ProcessElements(ElementReader reader)
    {
        Element element;
        while ((element = reader.Next()) != null) // Read page contents
        {
            if (element.GetType() == Element.Type.e_text_begin)
            {
                ProcessText(reader); // Process text strings...
            }
        }
    }

    /// <summary>
    /// Opens the input document, rotates each page according to its dominant text
    /// orientation, and saves the result.
    /// </summary>
    [STAThread]
    static void Main(string[] args)
    {
        try
        {
            PDFNet.Initialize();
            using (PDFDoc doc = new PDFDoc(@"path_pdf"))
            {
                doc.InitSecurityHandler();

                using (ElementReader page_reader = new ElementReader())
                {
                    for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) // Read every page
                    {
                        // Fresh histogram per page.
                        angles = new Dictionary<int, int>();
                        Page page = itr.Current();
                        page_reader.Begin(page);
                        ProcessElements(page_reader);
                        page_reader.End();

                        if (angles.Count == 0)
                        {
                            // No usable text on this page; leave its rotation untouched.
                            continue;
                        }

                        // Single pass to find the most frequent orientation.
                        int dominantAngle = 0;
                        int dominantCount = 0;
                        foreach (KeyValuePair<int, int> entry in angles)
                        {
                            if (entry.Value > dominantCount)
                            {
                                dominantAngle = entry.Key;
                                dominantCount = entry.Value;
                            }
                        }

                        // The rotation is set absolutely, including e_0, so that a
                        // stale rotation on a page whose text is already upright in
                        // user space is cleared as well.
                        switch (dominantAngle)
                        {
                            case 0:
                                page.SetRotation(Page.Rotate.e_0);
                                break;
                            case 90:
                                page.SetRotation(Page.Rotate.e_90);
                                break;
                            case 180:
                                page.SetRotation(Page.Rotate.e_180);
                                break;
                            case 270:
                                page.SetRotation(Page.Rotate.e_270);
                                break;
                        }
                    }
                }
                doc.Save(@"output_path", SDFDoc.SaveOptions.e_remove_unused);
                Console.WriteLine("Done.");
            }
        }
        catch (PDFNetException e)
        {
            Console.WriteLine(e.Message);
        }
    }
}
}

ElementReaderAdvTest.cs (4.23 KB)