Highlighting words in a PDF based on user input (in Java).

Aaron_Gravesdale · October 15, 2008, 6:58pm

Q: We have the requirement to highlight the word in the PDF on the
basis of user input (In JAVA). We are using the PDFTron's
TextExtractor.Word class for this purpose. For example,

Scenario 1: if the user specifies the word "chan", then we want only
this text (i.e. 'chan') to be highlighted in the PDF.

Right now, words such as "change", "chance", and "changing" are also
being highlighted in the PDF. We don't want these words to be
highlighted.

We want to extract and highlight the exact word (i.e. chan) from PDF.

Also please find the code snippet with this message:

package src;

import java.util.Iterator;

import pdftron.Common.PDFNetException;
import pdftron.PDF.Annot;
import pdftron.PDF.ColorPt;
import pdftron.PDF.ColorSpace;
import pdftron.PDF.Element;
import pdftron.PDF.ElementBuilder;
import pdftron.PDF.ElementReader;
import pdftron.PDF.ElementWriter;
import pdftron.PDF.GState;
import pdftron.PDF.PDFDoc;
import pdftron.PDF.PDFDraw;
import pdftron.PDF.PDFNet;
import pdftron.PDF.Page;
import pdftron.PDF.Rect;
import pdftron.PDF.TextExtractor;

import pdftron.*;
import pdftron.Common.*;
import pdftron.Filters.*;
import pdftron.SDF.*;
import pdftron.PDF.*;
import pdftron.PDF.Rect.*;

public class PDFHighLight2 {

  public static void main(String[] args) {
         PDFNet.initialize();
     String input_path="C:\\testpdf\\";
       PDFNet.setResourcesPath("E:\\PDFNetC\\resources ");
       TextExtractor.Word word=null;
       java.io.File f = new java.io.File("C:\\f1.txt");
      Iterator itr=null;
       String stringword=null;
       Rect Word_BBox= null;
       try
       {
        PDFDoc doc = new PDFDoc(input_path + "0809030833.pdf");
        Word_BBox= new Rect();
          doc.initSecurityHandler();
          ColorPt highlight_color = new ColorPt(1, 1, 0); // Yellow
          TextExtractor txt = new TextExtractor();

//itr= doc.getPageIterator();
                Page pg =null;
         int pageNum=1;
        int totalPage=0;
       totalPage=doc.getPageCount();
       TextExtractor.Line line = null;

//pg = (Page)itr.next();
//txt.begin(pg);

          while(pageNum<=totalPage -1)
          {
      pg = (Page)doc.getPage(pageNum);
           txt.begin(pg);
  line=txt.getFirstLine();
            for(int k = 0; k<=txt.getNumLines()-1;k++)
            {
                for(int i = 0;i<=line.getNumWords()-1;i++)
                {
            /* while (word.isValid())
              {*/
            // for (int j=0;j<=i;j++)
             // {
                word= line.getWord(i);

if(word!=null)

if(contains(word.getString(),"Chan")&&
((contains(word.getString(),","))||(contains(word.getString(),"."))))

//if(word.getString().equals("Paul"))
{

stringword= word.getString();

                 Word_BBox =word.getBBox();
        // pg.AnnotPushBack(CreateHighlightAnnot(doc, Word_BBox,
          // highlight_color));
                 pg.annotPushBack(CreateHighlightAnnot(doc, Word_BBox,
                     highlight_color,stringword));

// doc.save(input_path + "latest.pdf",0,null);

}
word=null;

}

// }

// break;

                if(line.isValid())
                 line =line.getNextLine();
              }
             // break;
          // if(pageNum==40)

            pageNum++;
            }
           // break;

// Used to extract words
doc.save(input_path + "latest.pdf",0,null);

        } catch (PDFNetException e)
        {
       e.printStackTrace();

}

   /**
    * Builds the appearance stream for a highlight: a filled, unstroked
    * rectangle in the given color, written with the multiply blend mode.
    *
    * @param doc            document the stream is created in
    * @param bbox           bounding box of the word to cover; also stored
    *                       as the stream's "BBox" entry
    * @param higlight_color fill color of the rectangle
    * @param word           text of the highlighted word (currently unused;
    *                       kept so existing callers keep compiling)
    * @return the appearance stream, or null if building it failed (the
    *         exception is printed, not rethrown)
    */
   public static Obj CreateHighlightAppearance(PDFDoc doc, Rect bbox,
         ColorPt higlight_color, String word)
   {
      Obj stm = null;
      try
      {
         ElementBuilder build = new ElementBuilder();
         ElementWriter writer = new ElementWriter();
         writer.begin(doc);

         // Draw background.
         // NOTE(review): the last two arguments are passed as x2 + 2 and y2;
         // confirm against the ElementBuilder.createRect documentation
         // whether it expects a second corner or a width/height pair.
         Element element = build.createRect(bbox.getX1() - 2, bbox.getY1(),
               bbox.getX2() + 2, bbox.getY2());
         element.setPathFill(true);
         element.setPathStroke(false);

         GState gs = element.getGState();
         gs.setFillColorSpace(ColorSpace.createDeviceRGB());
         gs.setFillColor(higlight_color);
         gs.setBlendMode(GState.e_bl_multiply);

         // The element is written exactly once, before the writer is ended;
         // nothing may be written after end() has returned the stream.
         writer.writeElement(element);
         stm = writer.end();

         // Set the bounding box
         stm.putRect("BBox", bbox.getRectangle());
      }
      catch (Exception ex)
      {
         // Best effort: report the failure and hand back whatever was built.
         ex.printStackTrace();
      }
      return stm;
   }

  /**
   * Creates a highlight annotation over the given bounding box, colors it,
   * and attaches the appearance produced by CreateHighlightAppearance.
   *
   * @param doc             document the annotation is created in
   * @param bbox            rectangle the annotation covers
   * @param highlight_color color of the annotation and its appearance
   * @param word            text of the highlighted word, forwarded to
   *                        CreateHighlightAppearance
   * @return the annotation, or null if creating it failed; if a later step
   *         fails the partially configured annotation is returned (the
   *         exception is printed, not rethrown)
   */
  public static Annot CreateHighlightAnnot(PDFDoc doc, Rect bbox,
      ColorPt highlight_color, String word)
  {
    Annot annot = null;
    try
    {
      annot = Annot.create(doc, Annot.e_Highlight, bbox);
      annot.setColor(highlight_color);

      Obj appearance = CreateHighlightAppearance(doc, bbox, highlight_color, word);
      annot.setAppearance(appearance);
    }
    catch (Exception ex)
    {
      ex.printStackTrace();
    }
    return annot;
  }
  /**
   * Reports whether {@code searched} occurs anywhere inside {@code full}.
   * This is a plain, case-sensitive substring test.
   *
   * @param full     the text to search in
   * @param searched the text to search for
   * @return true if {@code searched} is a substring of {@code full}
   */
  public static boolean contains(String full, String searched) {
    return full.indexOf(searched) >= 0;
  }
}
-------------
A: Looking over your code it seems that these words are being
highlighted due to your string match condition:

if(contains(word.getString(),"Chan")&&
((contains(word.getString(),","))||(contains(word.getString(),"."))))

You would need to change your string match condition. For example:

word.getString().equals("Chan") ....

If the comparison logic is more complex, you could use Java regular
expressions to test for a string match:

word.getString().matches(...)

In any case this issue doesn't seem to be related to PDFNet API
itself.