I'm trying to extract the text from cropped content(dataURL)

Parthi · December 13, 2021, 10:07am

Hi, I’m trying to extract the text from a cropped data URL, but I’m facing this issue: Exception:
Message: PDF header not found. The file is not a valid PDF document.
Conditional expression: false
Version : 9.1.0-cd205f5552
Platform : Emscripten
Architecture : Emscripten
Filename : Parser.cpp
Function : SkipHeader
Linenumber : 1163
“”

import React, { useRef, useEffect, useState } from ‘react’;
import WebViewer from ‘@pdftron/webviewer’;
import ‘./App.css’;

const App = () => {
const viewer = useRef(null);
const [crop, setCrop] = useState(null);

useEffect(() => {
WebViewer({
path: ‘/webviewer/lib’,
initialDoc: ‘/files/PDFTRON_about.pdf’,
fullAPI: true,
},
viewer.current,
).then(instance => {
instance.UI.disableElements([‘toolbarGroup-Shapes’]);
instance.UI.disableElements([‘toolbarGroup-View’]);
instance.UI.disableElements([‘toolbarGroup-Annotate’]);
instance.UI.disableElements([‘toolbarGroup-FillAndSign’]);
instance.UI.disableElements([‘toolbarGroup-Forms’]);
instance.UI.disableElements([‘toolbarGroup-Insert’]);

  // const { docViewer, annotManager } = instance;
  const { documentViewer, annotationManager, Tools, PDFNet } = instance.Core;

  instance.setToolMode('CropPage');
  instance.disableElements(['redoButton', 'undoButton']);

  var FitMode = instance.FitMode;
  instance.setFitMode(FitMode.FitWidth);

  const applyCrop = Tools.CropCreateTool.prototype.applyCrop;
  
  // Override of CropCreateTool's applyCrop: when a crop is applied, the crop
  // annotation's rectangle is also rendered to a canvas and logged as a data
  // URL. The original implementation (saved in `applyCrop` above) is invoked
  // at the end with the same `this` and arguments.
  Tools.CropCreateTool.prototype.applyCrop = function (e) {
    const filename = documentViewer.getDocument().getFilename();
    // This is the call the poster flags as failing (the "PDF header not found"
    // exception quoted at the top of the post). Note that the value passed is
    // the result of getFilename(), and that `doc` is never used afterwards.
    const doc = PDFNet.PDFDoc.createFromURL(filename);   // issue is there
    
    // Pick the first annotation created by the crop tool. `find` returns
    // undefined when there is none, in which case getRect() below would throw.
    const annotation = annotationManager.getAnnotationsList().find(annotation => annotation.ToolName === "CropPage")
    const cropRect = annotation.getRect();
    // Request a canvas for the annotation's page, passing the crop rectangle
    // as renderRect; the drawComplete callback logs the canvas as a data URL.
    // The call is not awaited, so the code below runs without waiting for it.
    documentViewer.getDocument().loadCanvasAsync({
      pageNumber: annotation.PageNumber,
      renderRect: cropRect,
      drawComplete: async (canvas, index) => {
        console.log('CROP_DATA', canvas.toDataURL());
      }
    });
    // Delegate to the original applyCrop so the crop itself still happens.
    applyCrop.apply(this, arguments);
  };
}).catch((error) => {
  console.log('error', error);
});

}, []);

return (

React sample

);
};

export default App;

awejasonhu · December 14, 2021, 4:25am

Hello Parthi,

Thank you for contacting us regarding WebViewer.

The provided code snippet is generally correct; however, there are two things you might want to take care of. Please see the revised code snippet below:

   const {
    documentViewer,
    annotationManager,
    Tools,
    PDFNet
  } = instance.Core;

  PDFNet.initialize(); // Before using PDFNet, initialization is recommended
  instance.setToolMode('CropPage');
  var FitMode = instance.FitMode;
  instance.setFitMode(FitMode.FitWidth);

  const applyCrop = Tools.CropCreateTool.prototype.applyCrop;
  // Revised override of CropCreateTool's applyCrop: logs the crop rectangle
  // rendered as a data URL, then calls the original implementation saved in
  // `applyCrop` above with the same `this` and arguments.
  Tools.CropCreateTool.prototype.applyCrop = function (e) {

    // `**PathToFile**` is a placeholder, not valid JavaScript: the asterisks
    // are forum emphasis markup. Substitute the URL of the PDF file here.
    // `doc` is not used anywhere else in this function.
    const doc = PDFNet.PDFDoc.createFromURL(**PathToFile**);   // issue is there, the parameter needs to be the **fileURL**, not the file name
    // First annotation created by the crop tool; `find` yields undefined if
    // none exists, and getRect() would then throw.
    const annotation = annotationManager.getAnnotationsList().find(annotation => annotation.ToolName === "CropPage")
    const cropRect = annotation.getRect();

    // Request a canvas for the annotation's page with the crop rectangle as
    // renderRect; drawComplete logs the resulting canvas as a data URL.
    // Not awaited, so applyCrop below runs without waiting for the callback.
    documentViewer.getDocument().loadCanvasAsync({
      pageNumber: annotation.PageNumber,
      renderRect: cropRect,
      drawComplete: async (canvas, index) => {
        console.log('CROP_DATA', canvas.toDataURL());
      }
    });
    // Run the original crop behavior.
    applyCrop.apply(this, arguments);
  }

I tested it, and it should be working.

Please let me know how this works for you, and if you have any questions.

Best Regards,
Jason Hu
Web Development Support Engineer
PDFTron Systems, Inc.

Parthi · December 14, 2021, 2:31pm

Hi, thanks for your support. I tried a different approach, using documentViewer.getDocument().getPDFDoc();

but I am getting errors like this:
PDFNet.js:1171 Uncaught (in promise) Error: Function TextExtractorLine.isValid recently altered a struct object without yielding. That object is now being accessed by function ‘isValid’. Perhaps a yield statement is required for TextExtractorLine.isValid?
at checkThisYieldFunction (PDFNet.js:1171)
at PDFNet.TextExtractorLine.isValid (PDFNet.js:1021)
at w._callee2$ (App.js:36)
at tryCatch (runtime.js:63)
at Generator.invoke [as _invoke] (runtime.js:282)
at Generator.prototype. [as next] (runtime.js:116)
at asyncGeneratorStep (asyncToGenerator.js:3)
at _next (asyncToGenerator.js:25)

==================================================================
Please check the code below:

import React, { useRef, useEffect, useState } from ‘react’;
import WebViewer from ‘@pdftron/webviewer’;
import ‘./App.css’;

const App = () => {
const viewer = useRef(null);
const [crop, setCrop] = useState(null);

useEffect(() => {
WebViewer({
path: ‘/webviewer/lib’,
initialDoc: ‘/files/PDFTRON_about.pdf’,
fullAPI: true,
},
viewer.current,
).then(instance => {
instance.UI.disableElements([‘toolbarGroup-Shapes’]);
instance.UI.disableElements([‘toolbarGroup-View’]);
instance.UI.disableElements([‘toolbarGroup-Annotate’]);
instance.UI.disableElements([‘toolbarGroup-FillAndSign’]);
instance.UI.disableElements([‘toolbarGroup-Forms’]);
instance.UI.disableElements([‘toolbarGroup-Insert’]);

  const { documentViewer, annotationManager, Tools, PDFNet } = instance.Core;

  instance.setToolMode('CropPage');
  instance.disableElements(['redoButton', 'undoButton']);

  var FitMode = instance.FitMode;
  instance.setFitMode(FitMode.FitWidth);

  PDFNet.initialize();

  const applyCrop = Tools.CropCreateTool.prototype.applyCrop;

  // Async override of CropCreateTool's applyCrop: tries to extract the text
  // inside the crop annotation's rectangle with PDFNet.TextExtractor, logs the
  // rectangle rendered as a data URL, and finally calls the original
  // implementation saved in `applyCrop` above.
  Tools.CropCreateTool.prototype.applyCrop = async function (e) {
    // First annotation created by the crop tool; `find` yields undefined if
    // none exists, and getRect() would then throw.
    const annotation = annotationManager.getAnnotationsList().find(annotation => annotation.ToolName === "CropPage");
    const cropRect = annotation.getRect();
    // Use the PDFDoc of the document already loaded in the viewer rather than
    // opening a second copy by URL.
    const docView = await documentViewer.getDocument().getPDFDoc();
    const extractPage = await docView.getPage(annotation.PageNumber);
    const txt = await PDFNet.TextExtractor.create();
    // NOTE(review): the rectangle is built directly from the annotation's
    // coordinates; whether these are in the coordinate space TextExtractor
    // expects is not shown here. TODO confirm.
    const rect = new PDFNet.Rect(cropRect.x1, cropRect.y1, cropRect.x2, cropRect.y2);
    // NOTE(review): begin() is not awaited, unlike the other PDFNet calls in
    // this function, and receives the page number as a third argument.
    // Confirm both against the TextExtractor API.
    txt.begin(extractPage, rect, annotation.PageNumber); // Read the page.
    

    // Request a canvas for the annotation's page with the crop rectangle as
    // renderRect; drawComplete logs the canvas as a data URL. Not awaited.
    documentViewer.getDocument().loadCanvasAsync({
      pageNumber: annotation.PageNumber,
      renderRect: cropRect,
      drawComplete: async (canvas, index) => {
        console.log('CROP_DATA', canvas.toDataURL());
      }
    });

    // Extract words one by one.
    let line = await txt.getFirstLine();  //
   let word;
    // isValid() is called WITHOUT await on the next line, while the loops
    // below await it. The error quoted in this post ("TextExtractorLine.isValid
    // recently altered a struct object without yielding") names this method.
    console.log('line valid', line.isValid());

    // Walk every valid line, and every valid word within each line.
    for (; (await line.isValid()); line = (await line.getNextLine())) {
      for (word = await line.getFirstWord(); (await word.isValid()); word = (await word.getNextWord())) {
        const textData = await word.getString();
        // setCrop is called once per word with that single word, not with
        // the accumulated text.
        setCrop(textData);
      }
    }
    // The original applyCrop runs only after text extraction has finished.
    applyCrop.apply(this, arguments);
  };
}).catch((error) => {
  console.log('error', error);
});

}, []);

return (

React sample

);
};

export default App;

awejasonhu · December 14, 2021, 10:25pm

Hello,

Thank you for your reply.

console.log('line valid', line.isValid());
should be changed to the following, because isValid() is an async function and its result must be awaited:
console.log('line valid', await line.isValid());

Please let me know how this works for you, and if you have any questions.

Best Regards,
Jason Hu
Web Development Support Engineer
PDFTron Systems, Inc.

Parthi · December 15, 2021, 6:12am

Hi Jason, thanks for your support. I’m not able to extract text from the cropped content.
Please check the image below (PDFNet.TextExtractorLine console output).

Please check the code below:

const { documentViewer, annotationManager, Tools, PDFNet } = instance.Core;

  instance.setToolMode('CropPage');
  instance.disableElements(['redoButton', 'undoButton']);

  var FitMode = instance.FitMode;
  instance.setFitMode(FitMode.FitWidth);

  PDFNet.initialize();

  const applyCrop = Tools.CropCreateTool.prototype.applyCrop;

  // Async override of CropCreateTool's applyCrop: tries to extract and log
  // each word inside the crop annotation's rectangle via PDFNet.TextExtractor,
  // then calls the original implementation saved in `applyCrop` above.
  Tools.CropCreateTool.prototype.applyCrop = async function (e) {
    // Awaits PDFNet initialization on every invocation (initialize() is also
    // called, un-awaited, before this override is installed).
    await PDFNet.initialize();
    // First annotation created by the crop tool; `find` yields undefined if
    // none exists, and getRect() would then throw.
    const annotation = annotationManager.getAnnotationsList().find(annotation => annotation.ToolName === "CropPage");
    const cropRect = annotation.getRect();
    // PDFDoc of the document already loaded in the viewer.
    const docView = await documentViewer.getDocument().getPDFDoc();
    const extractPage = await docView.getPage(annotation.PageNumber);
    let txt = await PDFNet.TextExtractor.create();
    // NOTE(review): the clip rectangle is built directly from the annotation's
    // coordinates; whether these match the coordinate space TextExtractor
    // expects is not shown here. TODO confirm.
    const rect = new PDFNet.Rect(cropRect.x1, cropRect.y1, cropRect.x2, cropRect.y2);
    // NOTE(review): begin() is not awaited, unlike the other PDFNet calls in
    // this function. Confirm whether it returns a promise.
    txt.begin(extractPage, rect); // Read the page.

    // Request a canvas for the annotation's page with the crop rectangle as
    // renderRect. In this version drawComplete only logs a marker string,
    // not the data URL. The call is not awaited.
    documentViewer.getDocument().loadCanvasAsync({
      pageNumber: annotation.PageNumber,
      renderRect: cropRect,
      drawComplete: async (canvas, index) => {
        console.log('CROP_DATA');
      }
    });

    // Extract words one by one.
    let line = await txt.getFirstLine(); let word;
    // Debug output: the extractor object, the first line object, and whether
    // that first line is valid (awaited here).
    console.log('txt',txt);
    console.log('line',line);
    console.log('line valid', await line.isValid());

    // Walk every valid line, and every valid word within each line, logging
    // each word's string.
    for (; (await line.isValid()); line = (await line.getNextLine())) {
      for (word = await line.getFirstWord(); (await word.isValid()); word = (await word.getNextWord())) {
        const textData = await word.getString();
        console.log('textData',textData);
      }
    }
    // The original applyCrop runs only after text extraction has finished.
    applyCrop.apply(this, arguments);
  };

awejasonhu · December 21, 2021, 6:23pm

Hello,

It seems like you opened another post: “Is it possible to get a cropped data from pdf?”

Since Diego is actively looking into this problem there, I will close this one for now.

Best Regards,
Jason Hu
Web Development Support Engineer
PDFTron Systems, Inc.