Q:
I have been using ProcessText code sample on PDF files for some time
and it works perfectly. I just ran across a PDF file today that this
routine is not picking up the text/chars as they are displayed in the
PDF file when viewed with the Adobe Reader. It finds the text but the
(char)char_code is all garbage.
Here is my code routine:
private void ProcessText(ElementReader page_reader)
{
Element element;
while ((element = page_reader.Next()) != null)
{
switch (element.GetType())
{
case Element.Type.e_text_end:
// Finish the text block
return;
case Element.Type.e_text:
{
GState gs = element.GetGState();
ColorSpace cs_fill =
gs.GetFillColorSpace();
ColorPt fill = gs.GetFillColor();
ColorPt outc = new ColorPt();
cs_fill.Convert2RGB(fill, outc);
ColorSpace cs_stroke =
gs.GetStrokeColorSpace();
ColorPt stroke = gs.GetStrokeColor();
Font font = gs.GetFont();
// Use element.GetCTM() if you are
interested in the CTM
// (current transformation matrix).
Matrix2D ctm = element.GetCTM();
Matrix2D text_mtx =
element.GetTextMatrix();
Matrix2D mtx = ctm * text_mtx;
double font_sz_scale_factor =
System.Math.Sqrt(mtx.m_b * mtx.m_b + mtx.m_d * mtx.m_d);
double font_size = gs.GetFontSize();
ColorPt font_color = gs.GetFillColor();
ColorSpace cs = gs.GetFillColorSpace();
ColorPt rgb = new ColorPt();
cs.Convert2RGB(font_color, rgb);
double x, y;
int char_code;
CharIterator end = element.CharEnd();
for (CharIterator itr =
element.CharBegin(); itr != end; itr.Next())
{
//Console.Write("Character code: ");
char_code = itr.Current().char_code;
//Console.Write((char)char_code);
x = itr.Current().x; // character
positioning information
y = itr.Current().y;
// To get the exact character
positioning information you need to
// concatenate current text matrix
with CTM and then multiply
// relative postitioning coordinates
with the resulting matrix.
//
mtx = ctm * text_mtx;
mtx.Mult(ref x, ref y);
char_code = char_code;
try
{
Console.WriteLine("(" +
char_code.ToString() + ") " + (char)char_code + " Position: x={0:f}
y={1:f}", x, y);
}
catch { }
}
break;
}
}
}
}
------
A:
'char_code' does not necessarily correspond to ASCII or Unicode value.
To obtain Unicode value you can use font.MapToUnicode() method:
In C++:
Unicode uni;
int uni_sz;
bool unicode_available = font.MapToUnicode(char_code, uni, uni_sz,
1);
In C#:
char val = 0;
int [] uni = new int[1];
int uni_sz;
if (font.MapToUnicode(char_code, uni, 1, ref uni_sz)) {
val = uni[0];
}
Alternatively you can use element.GetTextString() method which will
directly return the Unicode translation.