How do I extract text under PDF link annotations (hyperlink text)?

Aaron_Gravesdale · December 6, 2006, 10:04pm

Q: How do I extract text under PDF link annotations (hyperlink text)?
-----------

A:
The following is a simple extension to the TextExtract sample project
(http://www.pdftron.com/net/samplecode.html#TextExtract) that can be
used to extract the text under link annotations (or under any other
type of annotation):

// Prints the text lying under every annotation on every page of a PDF and,
// for link annotations with a GoTo action, looks up the destination page index.
PDFNet.Initialize();
PDFNet.SetResourcesPath("../../../../../resources");

// Relative path to the folder containing test files.
string input_path = "../../../../TestFiles/";

// TextExtractUtils comes from the TextExtract sample project; ReadTextFromRect
// is used here to obtain the text inside a rectangle on a page.
TextExtractUtils util = new TextExtractUtils();
ElementReader reader = new ElementReader();

try
{
  PDFDoc doc = new PDFDoc(input_path + "Song_book.pdf");
  try
  {
    doc.InitSecurityHandler();

    PageIterator page_end = doc.PageEnd();
    PageIterator page_begin = doc.PageBegin();

    PageIterator itr;
    for (itr = page_begin; itr != page_end; itr.Next()) // Read every page
    {
      Page page = itr.Current();
      int num_annots = page.GetNumAnnots();
      for (int i = 0; i < num_annots; ++i)
      {
        Annot annot = page.GetAnnot(i);

        // The annotation's rectangle is the region whose text is extracted.
        Rect annot_rect = annot.GetRect();
        string text_under_annot = util.ReadTextFromRect(page, annot_rect, reader);
        Console.WriteLine("Text under annot: {0}", text_under_annot);

        // Get destination page number ...
        if (annot.GetType() == Annot.Type.e_Link)
        {
          Action action = annot.GetLinkAction();
          if (action.GetType() == Action.Type.e_GoTo)
          {
            int page_num = action.GetDest().GetPage().GetIndex();
            // ... do something with this info...
          }
        }
      }
    }
  }
  finally
  {
    // Close the document even when extraction throws, so it is never left
    // open on the error path.
    doc.Close();
  }

  Console.WriteLine("Done.");
}
catch (PDFNetException e)
{
  Console.WriteLine(e.Message);
}

Also attached is the entire sample project, which you can extract into
the PDFNet/Samples folder ('Song_book.pdf' is assumed to be in
PDFNet/Samples/TestFiles).