Q: I am having some trouble with word bounding box coordinates
extracted using TextExtractor. I would like to draw a rectangle around
specific words on the page.
I have these 2 functions: CheckElements that checks for every page if
some text is found and if the text is found it calls DrawBox to draw a
box around the found word. The problem is that it works for some pages
and doesn't work for others. Any idea what could be the problem?
-----
A: The problem is that the origin of the media box for the source PDF
page is not [0, 0]. Instead the media box is [30.024, 30.024, 623.97,
821.97].
word.GetBBox() returns the bounding box in the page coordinate system,
without taking into account the page's /Rotate, /CropBox, or /MediaBox
attributes.
To take these attributes into account, you need to transform
the bounding box returned by word.GetBBox() using
page.GetDefaultMatrix().
For example:
// C++ sample (C#/Java is along the same lines)
// Scans every word on src_page and, for each word whose ASCII form contains
// `tofind`, prints the transformed bounding box and draws a box around the
// word via DrawBox().
//
// word.GetBBox() reports coordinates that ignore the page's /Rotate, /CropBox
// and /MediaBox attributes, so both corners are mapped through
// src_page.GetDefaultMatrix() before drawing.
//
// src_page - page to search and annotate.
// tofind   - substring to look for inside each extracted word.
void CheckElements(Page src_page,string tofind) {
    TextExtractor txt;
    txt.Begin(src_page, 0, TextExtractor::e_remove_hidden_text);

    Matrix2D mtx(src_page.GetDefaultMatrix());
    UString text;
    double out_bbox[4];

    for (TextExtractor::Line line = txt.GetFirstLine(); line.IsValid();
         line = line.GetNextLine()) {
        for (TextExtractor::Word word = line.GetFirstWord(); word.IsValid();
             word = word.GetNextWord()) {
            text.Assign(word.GetString(), word.GetStringLen());
            if (text.ConvertToAscii().find(tofind, 0) == string::npos) {
                continue;  // this word does not contain the search string
            }

            // Map both corners of the word box through the default matrix.
            word.GetBBox(out_bbox);
            mtx.Mult(out_bbox[0], out_bbox[1]);
            mtx.Mult(out_bbox[2], out_bbox[3]);
            cout << out_bbox[0] << " " << out_bbox[1] << " " <<
                    out_bbox[2] << " " << out_bbox[3] << endl;

            // On a rotated page the transformed corners may no longer be in
            // lower-left / upper-right order; normalize so the rectangle
            // handed to DrawBox has a well-defined origin and extent.
            Rect box(out_bbox[0], out_bbox[1], out_bbox[2], out_bbox[3]);
            box.Normalize();
            DrawBox(src_page, box);
        }
    }
}
// Draws a red outline, 2 units wide, of `bbox` on dest_page.
//
// dest_page - page the rectangle is written to.
// bbox      - rectangle to outline; its corners may be given in any order.
void DrawBox(Page &dest_page,Rect bbox)
{
    // Put the corners in lower-left / upper-right order so x1/y1 together
    // with Width()/Height() describe the intended rectangle.
    bbox.Normalize();

    ElementBuilder eb;
    ElementWriter writer;
    writer.Begin(dest_page);

    Element element = eb.CreateRect(bbox.x1, bbox.y1, bbox.Width(),
                                    bbox.Height());
    // Outline only: the goal is a box around the word, so the path is
    // stroked and explicitly not filled.
    element.SetPathStroke(true);
    element.SetPathFill(false);

    // The colour has to be set on the stroke colour space; setting the fill
    // colour has no effect on a stroked outline.
    GState gstate = element.GetGState();
    gstate.SetStrokeColorSpace(ColorSpace::CreateDeviceRGB());
    gstate.SetStrokeColor(ColorPt(1, 0, 0));
    gstate.SetLineWidth(2);

    writer.WriteElement(element);
    writer.End();
}