Q: How do I extract text under PDF link annotations (hyperlink text)?
-----------
A:
The following is a simple extension to the TextExtract sample project
(http://www.pdftron.com/net/samplecode.html#TextExtract) that can be
used to extract the text located under link annotations (or under any
other type of annotation):
// Prints the text located under every annotation on every page of
// Song_book.pdf. For link annotations that carry a GoTo action, the index of
// the destination page is looked up as well.
PDFNet.Initialize();
PDFNet.SetResourcesPath("../../../../../resources");

// Relative path to the folder containing test files.
string input_path = "../../../../TestFiles/";

// Helper used for its ReadTextFromRect method (presumably the one from the
// TextExtract sample project -- confirm against that sample).
TextExtractUtils util = new TextExtractUtils();

try
{
    // The 'using' statements release the reader and the document on every
    // exit path. Previously doc.Close() was skipped whenever an exception was
    // thrown inside the loop, and the reader was never released.
    using (ElementReader reader = new ElementReader())
    using (PDFDoc doc = new PDFDoc(input_path + "Song_book.pdf"))
    {
        doc.InitSecurityHandler();

        PageIterator page_end = doc.PageEnd();
        for (PageIterator itr = doc.PageBegin(); itr != page_end; itr.Next()) // Read every page
        {
            Page page = itr.Current();
            int num_annots = page.GetNumAnnots();
            for (int i = 0; i < num_annots; ++i)
            {
                Annot annot = page.GetAnnot(i);

                // The annotation's rectangle is the region whose text is read.
                Rect annot_rect = annot.GetRect();
                string text_under_annot = util.ReadTextFromRect(page, annot_rect, reader);
                Console.WriteLine("Text under annot: {0}", text_under_annot);

                // Only link annotations are inspected for a destination.
                if (annot.GetType() != Annot.Type.e_Link)
                {
                    continue;
                }

                // A link is not required to carry an action; skip it if it has none.
                Action action = annot.GetLinkAction();
                if (!action.IsValid() || action.GetType() != Action.Type.e_GoTo)
                {
                    continue;
                }

                // Get destination page number, skipping destinations that
                // cannot be resolved.
                Destination dest = action.GetDest();
                if (dest.IsValid())
                {
                    int page_num = dest.GetPage().GetIndex();
                    // ... do something with this info...
                }
            }
        }
    }

    Console.WriteLine("Done.");
}
catch (PDFNetException e)
{
    Console.WriteLine(e.Message);
}
Also attached is the entire sample project, which you can extract into
the PDFNet/Samples folder ('Song_book.pdf' is assumed to be in
PDFNet/Samples/TestFiles).