Is it possible to convert an untagged PDF to a tagged PDF?

Q: Is it possible to convert an untagged PDF to a tagged PDF with the PDFTron PDFNet SDK?

A: You can use the PDFNet SDK to implement PDF tagging; however, there is no single function that automatically generates a tagged PDF.

As a starting point, you may want to take a look at the LogicalStructure sample project.

Below is another example of how to tag a PDF. This example is a bit more involved and is written in C++. If you need to tag PDFs automatically (as opposed to tagging them manually), you could use 'pdftron.PDF.TextExtractor' and 'pdftron.PDF.ElementReader' as a starting point for the analysis.

// PDFTagTest.cpp …

#include <PDF/PDFNet.h>
#include <PDF/PDFDoc.h>
#include <PDF/ElementReader.h>
#include <PDF/Struct/SElement.h>
#include <PDF/Struct/ContentItem.h>
#include <SDF/NumberTree.h>

#include <cassert>
#include <iostream>
#include <string>
#include <vector>

using namespace pdftron;
using namespace PDF;
using namespace SDF;
using namespace Struct;
using namespace std;

/**

  • Creates a new SElement.
    /
    SElement SElement_Create(PDFDoc& doc, const char
    struct_type)
    {
    Obj e = doc.CreateIndirectDict();
    e.PutName(“S”, struct_type); // The structure type
    return SElement(e);
    }

/**
 * Inserts the specified kid SElement into this element.
 *
 * The kid's SDF object is inserted into the element's "K" array and the kid's
 * "P" (parent) entry is set to the element.
 *
 * @param element        The element that receives the kid.
 * @param kid            The element to insert.
 * @param insert_before  Index in the "K" array that is passed to Obj::Insert
 *                       for the kid. If the element currently has no kids,
 *                       insert_before is ignored and the kid is inserted at
 *                       index 0.
 *
 * NOTE(review): if "K" exists but is not an array (e.g. a single kid), it is
 * replaced by a new empty array, so the previous kid is dropped — confirm this
 * is acceptable for the inputs you process.
 */
void SElement_Insert(SElement& element, SElement& kid, int insert_before)
{
    assert(element.IsValid() && kid.IsValid());

    Obj st = element.GetSDFObj();
    Obj k = st.FindObj("K");
    if (!k || !k.IsArray()) {
        // No usable kids array yet: start a fresh one.
        k = st.PutArray("K");
    }
    assert(k.IsArray());

    // With an empty array the only valid position is the front.
    if (!k.Size()) insert_before = 0;

    Obj kid_obj = kid.GetSDFObj();
    k.Insert(insert_before, kid_obj);

    kid_obj.Put("P", st); // Parent
}

/**
 * Adds a new marked-content reference (an integer MCID) to the kids of a
 * structure element and registers the element in the structure tree's
 * "ParentTree" for the given page.
 *
 * Side effects performed by this function:
 *  - sets the element's "Pg" entry to @a page;
 *  - creates the element's "K" array if it is missing or is not an array
 *    (a non-array "K" is replaced, not preserved);
 *  - if the page has no "StructParents" entry, assigns it the current value
 *    of the root's "ParentTreeNextKey" and increments that counter;
 *  - creates the root's "ParentTree" (with an empty "Nums" array) and the
 *    per-page array inside it when they are missing;
 *  - appends the element to the per-page array.
 *
 * The document is expected to already have a valid structure tree (see
 * STree_Create()); this is only checked with an assert.
 *
 * @param doc            Document that owns the structure tree.
 * @param element        Structure element that will reference the content.
 * @param page           SDF dictionary of the page holding the content.
 * @param insert_before  Index in the element's "K" array that is passed to
 *                       Obj::InsertNumber for the MCID. A negative value
 *                       (the default) uses the current array size. If the
 *                       element currently has no kids, insert_before is
 *                       ignored and index 0 is used.
 * @return The MCID assigned to the new content item. It is the index at which
 *         the element was appended to the page's ParentTree array.
 *
 * NOTE(review): the MCID is derived from the size of the ParentTree array, so
 * this presumably assumes the page has no pre-existing MCIDs outside that
 * array — confirm for documents that were already tagged.
 */
int ContentItem_Create(PDFDoc& doc, SElement& element, Obj page, int insert_before = -1)
{
    assert(element.IsValid());

    Obj parent_obj = element.GetSDFObj();
    parent_obj.Put("Pg", page);

    // Locate (or create) the element's kids array.
    Obj k = parent_obj.FindObj("K");
    if (!k || !k.IsArray()) {
        k = parent_obj.PutArray("K");
    }
    assert(k.IsArray());
    if (insert_before < 0) insert_before = static_cast<int>(k.Size());
    if (!k.Size()) insert_before = 0;

    STree st = doc.GetStructTree();
    assert(st.IsValid());
    Obj st_obj = st.GetSDFObj();

    // Determine the page's key into the ParentTree, allocating one if needed.
    int struct_parent_idx = -1;
    Obj sp_num = page.FindObj("StructParents");
    if (sp_num) {
        struct_parent_idx = static_cast<int>(sp_num.GetNumber());
    }
    else {
        Obj cnt = st_obj.FindObj("ParentTreeNextKey");
        if (!cnt || !cnt.IsNumber()) cnt = st_obj.PutNumber("ParentTreeNextKey", 0);
        struct_parent_idx = static_cast<int>(cnt.GetNumber());
        page.PutNumber("StructParents", struct_parent_idx);
        cnt.SetNumber(struct_parent_idx + 1);
    }

    // Locate (or create) the ParentTree number tree.
    Obj pt = st_obj.FindObj("ParentTree");
    if (!pt || !pt.IsDict()) {
        pt = st_obj.PutDict("ParentTree");
        pt.PutArray("Nums");
    }

    NumberTree nt(pt);
    assert(nt.IsValid());
    assert(struct_parent_idx >= 0);

    // Look up the page's array of parent elements; create it on first use.
    Obj val = 0;
    DictIterator itr = nt.GetIterator(struct_parent_idx);
    if (itr.HasNext()) {
        val = itr.Value();
    }
    if (!val || !val.IsArray()) {
        val = doc.CreateIndirectArray();
        nt.Put(struct_parent_idx, val);
    }

    // The new MCID is the slot the element is about to occupy in that array.
    int mcid = static_cast<int>(val.Size());
    val.PushBack(parent_obj);

    k.InsertNumber(insert_before, mcid);
    return mcid;
}

/**
 * Inserts the specified kid element as a kid of the specified structure tree
 * root.
 *
 * The insertion only happens when the root has no "K" entry yet: a new "K"
 * array is created, the kid's SDF object is inserted into it, the kid's "P"
 * (parent) entry is set to the root, and @a kid is then reassigned to
 * tree_root.GetKid(insert_before).
 *
 * If the root already has a "K" entry (array or dictionary), the function
 * returns without modifying anything and @a kid stays unattached. Callers are
 * expected to delete an existing StructTreeRoot beforehand.
 *
 * @param tree_root      The structure tree root that receives the kid.
 * @param kid            The element to insert; updated in place on success.
 * @param insert_before  Index in the "K" array that is passed to Obj::Insert
 *                       for the kid. If the root currently has no kids,
 *                       insert_before is ignored and index 0 is used.
 */
void STree_Insert(STree& tree_root, SElement& kid, int insert_before)
{
    assert(tree_root.IsValid() && kid.IsValid());
    Obj st = tree_root.GetSDFObj();
    Obj k = st.FindObj("K");

    if (!k) // Note: k can be a dict.
    {
        k = st.PutArray("K");
    }
    else
    {
        // TODO: currently, we mandate existing StructTreeRoot is deleted. We
        // need to fix this in the future.
        return;
    }

    assert(k.IsArray());
    if (!k.Size()) insert_before = 0;
    Obj kid_obj = kid.GetSDFObj();
    k.Insert(insert_before, kid_obj);

    kid_obj.Put("P", st); // Parent

    // Re-read the kid through the tree root.
    kid = tree_root.GetKid(insert_before);
}

/**
 * Returns the document's structure tree, creating it if it is missing.
 *
 * When the document has no valid structure tree, a new indirect dictionary
 * with "Type" = "StructTreeRoot" and "ParentTreeNextKey" = 0 is created and
 * stored under "StructTreeRoot" in the document root.
 *
 * @param doc  Document to query or modify.
 * @return The existing structure tree if valid, otherwise the new one.
 */
STree STree_Create(PDFDoc& doc)
{
    STree sroot = doc.GetStructTree();
    if (sroot.IsValid()) {
        return sroot;
    }

    // Create a structure tree if it is missing.
    Obj s = doc.CreateIndirectDict();
    s.PutName("Type", "StructTreeRoot");
    s.PutNumber("ParentTreeNextKey", 0);

    doc.GetRoot().Put("StructTreeRoot", s);
    return STree(s);
}

int main(int argc, char *argv[])
{
int ret = 0;
PDFNet::Initialize();

// Relative path to the folder containing test files.
string input_path = “…/…/TestFiles/”;
string output_path = “…/…/TestFiles/Output/”;

try // Extract logical structure from a PDF document
{
//–preparation
PDFDoc doc((input_path + “my.pdf”).c_str());
doc.InitSecurityHandler();

//remove the existing structure tree
doc.GetRoot().Erase(“StructTreeRoot”);

Page pg = doc.GetPage(1);
TRN_DispList dl = 0;
REX(TRN_DispListCreate(pg.mp_page, &dl));

//remove the existing structure tree
pg.GetSDFObj().Erase(“StructTreeRoot”);

STree sroot = STree_Create(doc);
bool marked = false;
TRN_Bool result; //tell if there are any graphical elements marked
TRN_Bool intersection_mode = 0; //tag using intersecting (0) or containing(1)
vector rects;
bool reshuffle = true;

//–mark content 1
SElement p1 = SElement_Create(doc, “P”);
STree_Insert(sroot, p1, 0);
int mcid1 = ContentItem_Create(doc, p1, pg.GetSDFObj());
Obj prop1 = doc.CreateIndirectDict();
prop1.PutNumber(“MCID”, mcid1);
rects.clear();
rects.push_back(182); rects.push_back(453);
rects.push_back(227); rects.push_back(464);
REX(TRN_DispListTag(dl, &rects[0], rects.size()/4, “P”, prop1.mp_obj, intersection_mode, reshuffle, &result));
marked |= TBToB(result);

//–output file
if ( marked )
{
REX(TRN_DispListSave(dl, pg.mp_page));

doc.GetRoot().PutDict(“MarkInfo”).PutBool(“Marked”, true);
doc.Save("…/…/TestFiles/tag_out.pdf", pdftron::SDF::SDFDoc::e_linearized, 0);
}

REX(TRN_DispListDestroy(dl));

cout << “\nDone.” << endl;
}
catch(Common::Exception& e)
{
cout << e << endl;
ret = 1;
}
catch(…)
{
cout << “Unknown Exception” << endl;
ret = 1;
}

PDFNet::Terminate();
return ret;
}