How do I create a complex 'highlight' annotation to highlight scattered, multiline text in a PDF?

Q: I was able to successfully search for and highlight words in a PDF
file using PDFNet. The framework currently allows searching for words
(using the TextExtractor class) and highlighting them in the file.

I can easily achieve highlighting for a group of adjacent words on the
same line by searching for adjacent words and by merging the
bounding rectangles which correspond to those words.

Currently I cannot figure out whether it is possible (and if it is
possible, how?) to highlight a scattered paragraph of text which
spans two or more lines and to have ONLY one annotation for the
highlighted region. For this I would need to specify a polygon for the
highlighted text zone and also for the active area (the area on which
the user can click to show the annotation popup).

I tried to use polygon or polyline as annotation type when I create
the annotation, but the parameter which specifies the highlighted area
is still a rectangle.

It might be possible that I’m looking at solving the problem from a
wrong point of view, and maybe you could point me in the right
direction.
-------
A: To create a more complex highlight you could add additional
rectangles to the QuadPoints array (in the highlight annotation
dictionary) and extend the appearance generation code by drawing a
path made of several rectangles instead of a single rectangle.

For example [this is only pseudocode]:

// Create a Highlight Annotation for multiple boxes (e.g. for a
// multiline, scattered paragraph).
//
//   doc             - document the annotation is created in.
//   bboxes          - one rectangle per highlighted run of text.
//   highlight_color - color used for both the annotation and its appearance.
//
// The annotation's rectangle is the union of all boxes; its "QuadPoints"
// array receives eight numbers per box. The annotation is returned without
// being attached to any page.
static Annot CreateHighlightAnnot(PDFDoc doc, List<Rect> bboxes,
ColorPt highlight_color) {
   Annot a = Annot.Create(doc, Annot.Type.e_Highlight, RectangleUnion(bboxes));
   a.SetColor(highlight_color);
   a.SetAppearance(CreateHighlightAppearance(doc, bboxes, highlight_color));

   Obj quads = a.GetSDFObj().PutArray("QuadPoints");
   foreach (Rect box in bboxes)
   {
     // Corner order per box: (x1,y2), (x2,y2), (x1,y1), (x2,y1).
     quads.PushBackNumber(box.x1);
     quads.PushBackNumber(box.y2);
     quads.PushBackNumber(box.x2);
     quads.PushBackNumber(box.y2);
     quads.PushBackNumber(box.x1);
     quads.PushBackNumber(box.y1);
     quads.PushBackNumber(box.x2);
     quads.PushBackNumber(box.y1);
   }
   return a;
}

// Use PDFNet to generate appearance stream for highlight annotation.
//
//   doc            - document the appearance stream is written into.
//   bboxes         - rectangles to fill; all of them go into one path.
//   higlight_color - RGB fill color.
//
// Returns the stream object with "BBox" and "Subtype" = "Form" set.
static Obj CreateHighlightAppearance(PDFDoc doc, List<Rect> bboxes, ColorPt
higlight_color) {
   ElementBuilder build = new ElementBuilder();

   // Construct a single path holding one rectangle per box...
   build.PathBegin();

   foreach (Rect box in bboxes) {
     // Add a rectangle to the current path as a complete subpath.
     // ElementBuilder.Rect takes (x, y, width, height), not two corners,
     // so the 2-unit padding on each side adds 4 to the width.
     build.Rect(box.x1 - 2, box.y1, box.x2 - box.x1 + 4, box.y2 - box.y1);
   }

   Element element = build.PathEnd();
   element.SetPathFill(true);
   element.SetPathStroke(false);
   GState gs = element.GetGState();
   gs.SetFillColorSpace(ColorSpace.CreateDeviceRGB());
   gs.SetFillColor(higlight_color);
   gs.SetBlendMode(GState.BlendMode.e_bl_multiply);

   ElementWriter writer = new ElementWriter();
   writer.Begin(doc);
   writer.WriteElement(element);
   Obj stm = writer.End();

   build.Dispose();
   writer.Dispose();

   // Set the bounding box
   // NOTE(review): the union below does not include the 2-unit horizontal
   // padding drawn above -- confirm whether the padding should be visible.
   Rect bbox = RectangleUnion(bboxes);
   stm.PutRect("BBox", bbox.x1, bbox.y1, bbox.x2, bbox.y2);
   stm.PutName("Subtype", "Form");
   return stm;
}

Q: Thank you very much for the code samples. I want to use the code
above for creating highlight annotations. There is one thing that I do
not understand in the code: the RectangleUnion function. It takes a
List<Rect> and it returns a Rect (because it needs to be passed as a
parameter in the Annot.Create function, and this accepts a Rect as a
parameter).

How can this be done (converting a group of rectangles to only one
rectangle) when the rectangles can be scattered over many lines?
Thanks.
-----
A: To compute the union of all rectangles you would enumerate the list
of rectangles and keep track of the lowest x1 and y1 values and the
highest x2 and y2 values.

For example:

// This is only pseudocode
Rect RectUnion(List<Rect> bboxes) {
  // Start from the first rectangle, then widen the result to cover each of
  // the others. Indexing bboxes[0] means the list must not be empty.
  double x1 = bboxes[0].x1, y1 = bboxes[0].y1, x2 = bboxes[0].x2, y2 = bboxes[0].y2;
  for (int i = 1; i < bboxes.Count; ++i) {
    if (bboxes[i].x1 < x1) x1 = bboxes[i].x1;
    if (bboxes[i].y1 < y1) y1 = bboxes[i].y1;
    if (bboxes[i].x2 > x2) x2 = bboxes[i].x2;
    if (bboxes[i].y2 > y2) y2 = bboxes[i].y2;
  }
  // Return only after every rectangle has been examined.
  return new Rect(x1, y1, x2, y2);
}

Q: Thanks, PDF text highlighting for complex regions works fine now.
However, I am getting some errors in older versions of Acrobat
Reader. Can you spot anything wrong in my highlighting code?

// Caller-side fragment: build the region list, run the highlighting pass
// over the document, then save and close it.
// NOTE(review): HighlightRegions, highlightColor and file are declared
// outside this excerpt; presumably file is the PDFDoc being annotated -- confirm.
var highlightRegions = new List<HighlightRegionInfo>();
HighlightRegions(highlightRegions, highlightColor, file);
// The SaveAs arguments were elided ("...") in the original post.
file.SaveAs(...);
file.Close();

/// <summary>
/// Computes the rectangle spanning the lowest x1/y1 and the highest x2/y2
/// found among <paramref name="boxes"/>.
/// </summary>
/// <param name="boxes">Rectangles to combine; must contain at least one element.</param>
/// <returns>A new <c>Rect</c> built from the four extreme coordinates.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="boxes"/> is null.</exception>
/// <exception cref="System.InvalidOperationException"><paramref name="boxes"/> is empty.</exception>
public Rect RectUnion(List<Rect> boxes)
{
    // Every edge is taken over ALL boxes. Restricting x1 to the boxes on the
    // lowest line and x2 to the boxes on the highest line (as was done before)
    // is not a union: for a region whose top line ends left of where its
    // bottom line starts it yields x1 > x2.
    double x1 = boxes.Min(box => box.x1);
    double y1 = boxes.Min(box => box.y1);
    double x2 = boxes.Max(box => box.x2);
    double y2 = boxes.Max(box => box.y2);
    return new Rect(x1, y1, x2, y2);
}

/// <summary>
/// Creates one highlight annotation covering every word of
/// <paramref name="region"/> and appends it to the given page.
/// </summary>
/// <param name="pdfDocument">Document the annotation is created in.</param>
/// <param name="region">Supplies the word boxes and the annotation text.</param>
/// <param name="highlightColor">Color, converted through <c>ToPdfColor()</c>.</param>
/// <param name="pageNumber">Number passed to <c>pdfDocument.GetPage</c>.</param>
public void CreateHighlightAnnotation(PDFDoc pdfDocument,
HighlightRegionInfo region, Color highlightColor, int pageNumber)
{
    List<Rect> wordBoxes = (from word in region.Words
                            select word.Box).ToList();

    // The annotation rectangle is the union of all word boxes.
    Rect annotationBounds = RectUnion(wordBoxes);
    Annot highlight = Annot.Create(pdfDocument, Annot.Type.e_Highlight, annotationBounds);

    ColorPt pdfColor = highlightColor.ToPdfColor();
    highlight.SetColor(pdfColor);

    var note = new Text(highlight);
    note.SetTextContents(region.Text);

    highlight.SetDate(DateTime.Now.ToPdfDate());
    var solidBorder = new Annot.BorderStyle(Annot.BorderStyle.Style.e_solid, 1);
    highlight.SetBorderStyle(solidBorder);

    Page targetPage = pdfDocument.GetPage(pageNumber);
    Obj quadPoints = highlight.GetSDFObj().PutArray("QuadPoints");

    // Words sharing exactly the same y1 are treated as one text line and
    // merged into a single rectangle for that line.
    List<Rect> lineBoxes =
        (from box in wordBoxes
         group box by box.y1 into line
         select RectUnion(line.ToList())).ToList();

    foreach (Rect lineBox in lineBoxes)
    {
        PushBackBox(quadPoints, lineBox);
    }

    Obj appearance = CreateHighlightAppearance(lineBoxes, pdfColor, targetPage);
    highlight.SetAppearance(appearance);
    targetPage.AnnotPushBack(highlight);
}

/// <summary>
/// Appends the four corners of <paramref name="box"/> to
/// <paramref name="quads"/> as eight numbers, with every y value raised by 2.
/// </summary>
/// <param name="quads">Array object that receives the numbers.</param>
/// <param name="box">Rectangle whose corners are written.</param>
private void PushBackBox(Obj quads, Rect box)
{
    double upperY = box.y2 + 2;
    double lowerY = box.y1 + 2;

    // Corner order: (x1,upper), (x2,upper), (x1,lower), (x2,lower).
    double[] coordinates =
    {
        box.x1, upperY,
        box.x2, upperY,
        box.x1, lowerY,
        box.x2, lowerY,
    };

    foreach (double coordinate in coordinates)
    {
        quads.PushBackNumber(coordinate);
    }
}

/// <summary>
/// Builds the appearance stream for a highlight annotation: one filled path
/// containing a rectangle per entry of <paramref name="boxes"/>.
/// </summary>
/// <param name="boxes">Rectangles to fill.</param>
/// <param name="highlightColor">RGB fill color.</param>
/// <param name="page">Used only to reach the owning document; nothing is written to the page itself.</param>
/// <returns>The stream object, with "BBox" and "Subtype" = "Form" set.</returns>
public Obj CreateHighlightAppearance(
    List<Rect> boxes, ColorPt highlightColor, Page page)
{
    Obj highlightAppearance;

    // using blocks release the builder and writer even if a call below throws.
    using (var elementBuilder = new ElementBuilder())
    using (var elementWriter = new ElementWriter())
    {
        elementBuilder.PathBegin();

        foreach (Rect box in boxes)
        {
            // Rect takes (x, y, width, height); as written this fills
            // [x1 - 2, x2] horizontally and [y1, y2 + 2] vertically.
            elementBuilder.Rect(box.x1 - 2, box.y1,
                box.x2 - box.x1 + 2, box.y2 - box.y1 + 2);
        }

        Element element = elementBuilder.PathEnd();
        element.SetPathFill(true);
        element.SetPathStroke(false);

        GState elementGraphicState = element.GetGState();
        elementGraphicState.SetFillColorSpace(ColorSpace.CreateDeviceRGB());
        elementGraphicState.SetFillColor(highlightColor);
        elementGraphicState.SetBlendMode(GState.BlendMode.e_bl_multiply);

        // Write into a stand-alone stream owned by the document rather than
        // into the page content (Begin(page, true)). ElementWriter.Begin has
        // no parameterless overload, so the document is passed explicitly.
        elementWriter.Begin(page.GetSDFObj().GetDoc());
        elementWriter.WriteElement(element);
        highlightAppearance = elementWriter.End();
    }

    // Set the bounding box
    // NOTE(review): this union does not include the 2-unit padding drawn
    // above -- confirm whether the padded area is meant to be visible.
    Rect boundingBox = RectUnion(boxes);
    highlightAppearance.PutRect("BBox", boundingBox.x1,
        boundingBox.y1, boundingBox.x2, boundingBox.y2);
    highlightAppearance.PutName("Subtype", "Form");

    return highlightAppearance;
}
-------
A: The problem is that you are adding highlight appearances to an
existing page (elementWriter.Begin(page, true)) instead of a separate
appearance stream (elementWriter.Begin()). To fix this, simply replace:

elementWriter.Begin(page, true);

with

elementWriter.Begin();

Q: Thanks for the explanation. It worked.
I used: elementWriter.Begin(page.GetSDFObj().GetDoc());
I think this is what you meant, because there is no overload of
elementWriter.Begin() that takes no parameters.