Q:
I am using the approach illustrated in the ElementEdit sample project
(http://www.pdftron.com/net/samplecode.html#ElementEdit) to separate
PDF content into separate layers (one layer for text, one layer for
images, etc).
The code generates new pages by separating text and image from a given
PDF, and uses PDFRasterizer class to output new pages into bitmaps.
It ran into a problem with one PDF sample: the images are not captured
on the new pages.
The current PDF page separation code is along the following lines:
/// <summary>
/// Reads every element from <paramref name="reader"/> and distributes it across three writers:
/// text goes to <paramref name="GC"/> (fill colour left untouched) and to <paramref name="S"/>
/// (forced to black DeviceRGB); images, inline images, paths and shadings go to
/// <paramref name="I"/>; every other element is copied to all three writers.
/// Form XObjects are not copied as a unit; their content is processed recursively.
/// </summary>
/// <param name="reader">Reader already positioned on the page (or form) to be split.</param>
/// <param name="S">Writer that receives text rendered as black filled text.</param>
/// <param name="I">Writer that receives images, inline images, paths and shadings.</param>
/// <param name="GC">Writer that receives text in fill mode with its existing fill colour.</param>
void SeparatePDFPageIntoLayers(ElementReader reader,
    ElementWriter S, ElementWriter I, ElementWriter GC)
{
    Element element;
    while ((element = reader.Next()) != null)
    {
        Element.Type type = element.GetType();
        if (type == Element.Type.e_text ||
            type == Element.Type.e_text_begin ||
            type == Element.Type.e_text_end ||
            type == Element.Type.e_text_new_line)
        {
            GState gs = element.GetGState();

            // Change text render mode to FILL
            gs.SetTextRenderMode(GState.TextRenderingMode.e_fill_text);

            // GC must be written before the colour is overridden below: both writes share
            // the same graphics state object, so the order decides what each layer sees.
            GC.WriteElement(element);

            gs.SetFillColorSpace(ColorSpace.CreateDeviceRGB());
            gs.SetFillColor(new ColorPt(0, 0, 0));
            S.WriteElement(element);
        }
        else if (type == Element.Type.e_image ||
                 type == Element.Type.e_inline_image ||
                 type == Element.Type.e_path ||
                 type == Element.Type.e_shading)
        {
            I.WriteElement(element);

            // Manually output the transform for the image (or form) being skipped.
            // The operands are formatted with the invariant culture so that the decimal
            // separator is always '.', regardless of the machine's regional settings.
            Matrix2D m = element.GetGState().GetTransform();
            string cm = string.Format(
                System.Globalization.CultureInfo.InvariantCulture,
                "{0} {1} {2} {3} {4} {5} cm ",
                m.m_a, m.m_b, m.m_c, m.m_d, m.m_h, m.m_v);
            S.WriteString(cm);
            GC.WriteString(cm);
        }
        else if (type == Element.Type.e_form)
        {
            reader.FormBegin();
            // recursion point
            SeparatePDFPageIntoLayers(reader, S, I, GC);
            reader.End();
        }
        else
        {
            S.WriteElement(element);
            I.WriteElement(element);
            GC.WriteElement(element);
        }
    }
}
------
A:
PDFNet SDK is returning correct results. The problem is that in this
PDF document, text is used as a clipping path to clip images and other
content. This means that PDF viewer (such as pdftron.PDF.PDFDraw,
pdftron.PDF.PDFView, or Acrobat) must convert glyph outlines to a path
which is then used to clip subsequent images. It may appear that the
images are missing from the PDF file, but the real problem is that they are
stacked on top of each other (each one obscuring the images beneath it).
Similarly the extracted text appears as black because that is the
default fill color for text. The original text was used as a clip path
(i.e. element.GetGState().GetTextRenderMode() is returning e_clip_text
which means that text does not have any intrinsic color). The apparent
text color was coming from clipped images that were drawn on top of
the text.
As a simple work-around for this issue you may want to use PDFDraw
class to convert PDF pages to raster images when you detect that the
page contains text that is used as a clipping path. There are probably
other ways to deal with clipping text but they may involve more
programming on your part.
The modified ElementEdit sample project illustrating this functionality is
as follows:
//
// PDFNet Copyright (c) 2001-2007 by PDFTron Systems Inc. All Rights Reserved.
//
using System;
using pdftron;
using pdftron.Common;
using pdftron.Filters;
using pdftron.SDF;
using pdftron.PDF;
namespace ElementEditTestCS
{
    /// <summary>
    /// The sample code shows how to edit the page display list and how to modify graphics state
    /// attributes on existing Elements. In particular the sample program strips all images from
    /// the page and changes text color to blue.
    /// </summary>
    class ElementEditTest
    {
        /// <summary>
        /// Writes the given matrix to the content stream as a raw 'cm' operator.
        /// </summary>
        /// <param name="writer">Writer that receives the operator string.</param>
        /// <param name="m">Matrix whose six coefficients become the operands.</param>
        private static void WriteTransform(ElementWriter writer, Matrix2D m)
        {
            // Format with the invariant culture so the decimal separator is always '.',
            // regardless of the machine's regional settings.
            writer.WriteString(string.Format(
                System.Globalization.CultureInfo.InvariantCulture,
                "{0} {1} {2} {3} {4} {5} cm ",
                m.m_a, m.m_b, m.m_c, m.m_d, m.m_h, m.m_v));
        }

        /// <summary>
        /// The main entry point for the application. For every page of the input document it
        /// builds a replacement page on which text is filled in blue and images are dropped,
        /// then saves the result.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args)
        {
            PDFNet.Initialize();
            PDFNet.SetResourcesPath("../../../../../resources");

            // Relative path to the folder containing test files.
            string input_path = "../../../../TestFiles/";
            string output_path = "../../../../TestFiles/Output/";

            try
            {
                Console.WriteLine("-------------------------------------------------");

                // Open the test file
                Console.WriteLine("Opening the input file...");
                PDFDoc doc = new PDFDoc(input_path + "1.pdf");
                try
                {
                    doc.InitSecurityHandler();

                    int num_pages = doc.GetPagesCount();

                    ElementWriter writer = new ElementWriter();
                    ElementReader reader = new ElementReader();
                    Element element;

                    for (int i = 1; i <= num_pages; ++i)
                    {
                        PageIterator itr = doc.PageFind(i);
                        Page page = itr.Current();
                        reader.Begin(page);

                        // The replacement page is inserted next to the original; the original
                        // is removed at the end of this iteration.
                        Page new_page = doc.PageCreate();
                        new_page.SetRotation(Page.Rotate.e_0);
                        doc.PageInsert(itr.Next(), new_page);

                        writer.Begin(new_page);
                        while ((element = reader.Next()) != null) // Read page contents
                        {
                            // Intention: to keep captured text elements with their original
                            // colors (for the GC page)
                            // Result: all captured text elements appear in black color,
                            // some have their colors changed.
                            /* if (element.GetType() == Element.Type.e_text) {
                                   element.GetGState().SetTextRenderMode(GState.TextRenderingMode.e_fill_text);
                               }
                               else if (element.GetType() == Element.Type.e_image)
                               {
                                   Matrix2D m = element.GetGState().GetTransform();
                                   writer.WriteString(m.m_a + " " + m.m_b + " " + m.m_c + " " +
                                       m.m_d + " " + m.m_h + " " + m.m_v + " cm ");
                                   continue;
                               } */

                            // Intention: to capture image elements only (for the I page)
                            // Result: most captured image elements are not displayed in the
                            // output PDF.
                            /* if (element.GetType() == Element.Type.e_image) {
                               }
                               else if (element.GetType() == Element.Type.e_text)
                               {
                                   Matrix2D m = element.GetGState().GetTransform();
                                   writer.WriteString(m.m_a + " " + m.m_b + " " + m.m_c + " " +
                                       m.m_d + " " + m.m_h + " " + m.m_v + " cm ");
                                   continue;
                               } */

                            if (element.GetType() == Element.Type.e_text)
                            {
                                // Set all text to blue color.
                                GState gs = element.GetGState();
                                gs.SetFillColorSpace(ColorSpace.CreateDeviceRGB());
                                gs.SetFillColor(new ColorPt(0, 0, 1));

                                // Change text render mode to FILL
                                gs.SetTextRenderMode(GState.TextRenderingMode.e_fill_text);
                            }
                            else if (element.GetType() == Element.Type.e_image)
                            {
                                // Manually output the transform for the image (or form) being
                                // skipped.
                                WriteTransform(writer, element.GetGState().GetTransform());

                                // remove all images
                                continue;
                            }

                            writer.WriteElement(element);
                        }

                        writer.End();
                        reader.End();

                        new_page.SetMediaBox(page.GetCropBox());
                        new_page.SetRotation(page.GetRotation());
                        doc.PageRemove(doc.PageFind(i));
                    }

                    doc.Save(output_path + "newsletter_edited.pdf",
                        Doc.SaveOptions.e_remove_unused);
                }
                finally
                {
                    // Close the document even when processing or saving fails.
                    doc.Close();
                }

                Console.WriteLine("Done. Result saved in newsletter_edited.pdf...");
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
            finally
            {
                // Shut the library down on every exit path, including unexpected exceptions.
                PDFNet.Terminate();
            }
        }
    }
}