tesseract 5.2.0
Loading...
Searching...
No Matches
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir, bool textonly=false)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
bool happy () const
 
int imagenum () const
 

Protected Member Functions

bool BeginDocumentHandler () override
 
bool AddImageHandler (TessBaseAPI *api) override
 
bool EndDocumentHandler () override
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
virtual bool BeginDocumentHandler ()
 
virtual bool AddImageHandler (TessBaseAPI *api)=0
 
virtual bool EndDocumentHandler ()
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 215 of file renderer.h.

Constructor & Destructor Documentation

◆ TessPDFRenderer()

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir,
bool  textonly = false 
)

Definition at line 183 of file pdfrenderer.cpp.

184 : TessResultRenderer(outputbase, "pdf"), datadir_(datadir) {
185 obj_ = 0;
186 textonly_ = textonly;
187 offsets_.push_back(0);
188}
struct TessResultRenderer TessResultRenderer
Definition: capi.h:59

Member Function Documentation

◆ AddImageHandler()

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api )
override protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 804 of file pdfrenderer.cpp.

804 {
805 Pix *pix = api->GetInputImage();
806 const char *filename = api->GetInputName();
807 int ppi = api->GetSourceYResolution();
808 if (!pix || ppi <= 0) {
809 return false;
810 }
811 double width = pixGetWidth(pix) * 72.0 / ppi;
812 double height = pixGetHeight(pix) * 72.0 / ppi;
813
814 std::stringstream xobject;
815 // Use "C" locale (needed for int values larger than 999).
816 xobject.imbue(std::locale::classic());
817 if (!textonly_) {
818 xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
819 }
820
821 // PAGE
822 std::stringstream stream;
823 // Use "C" locale (needed for double values width and height).
824 stream.imbue(std::locale::classic());
825 stream.precision(2);
826 stream << std::fixed << obj_
827 << " 0 obj\n"
828 "<<\n"
829 " /Type /Page\n"
830 " /Parent 2 0 R\n" // Pages object
831 " /MediaBox [0 0 "
832 << width << " " << height
833 << "]\n"
834 " /Contents "
835 << (obj_ + 1)
836 << " 0 R\n" // Contents object
837 " /Resources\n"
838 " <<\n"
839 " "
840 << xobject.str() << // Image object
841 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
842 " /Font << /f-0-0 3 0 R >>\n" // Type0 Font
843 " >>\n"
844 ">>\n"
845 "endobj\n";
846 pages_.push_back(obj_);
847 AppendPDFObject(stream.str().c_str());
848
849 // CONTENTS
850 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
851 const size_t pdftext_len = strlen(pdftext.get());
852 size_t len;
853 unsigned char *comp_pdftext =
854 zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
855 long comp_pdftext_len = len;
856 stream.str("");
857 stream << obj_
858 << " 0 obj\n"
859 "<<\n"
860 " /Length "
861 << comp_pdftext_len
862 << " /Filter /FlateDecode\n"
863 ">>\n"
864 "stream\n";
865 AppendString(stream.str().c_str());
866 long objsize = stream.str().size();
867 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
868 objsize += comp_pdftext_len;
869 lept_free(comp_pdftext);
870 const char *b2 =
871 "endstream\n"
872 "endobj\n";
873 AppendString(b2);
874 objsize += strlen(b2);
875 AppendPDFObjectDIY(objsize);
876
877 if (!textonly_) {
878 char *pdf_object = nullptr;
879 int jpg_quality;
880 api->GetIntVariable("jpg_quality", &jpg_quality);
881 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {
882 return false;
883 }
884 AppendData(pdf_object, objsize);
885 AppendPDFObjectDIY(objsize);
886 delete[] pdf_object;
887 }
888 return true;
889}
void AppendString(const char *s)
Definition: renderer.cpp:111
void AppendData(const char *s, int len)
Definition: renderer.cpp:118

◆ BeginDocumentHandler()

bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 483 of file pdfrenderer.cpp.

483 {
484 AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
485
486 // CATALOG
487 AppendPDFObject(
488 "1 0 obj\n"
489 "<<\n"
490 " /Type /Catalog\n"
491 " /Pages 2 0 R\n"
492 ">>\nendobj\n");
493
494 // We are reserving object #2 for the /Pages
495 // object, which I am going to create and write
496 // at the end of the PDF file.
497 AppendPDFObject("");
498
499 // TYPE0 FONT
500 AppendPDFObject(
501 "3 0 obj\n"
502 "<<\n"
503 " /BaseFont /GlyphLessFont\n"
504 " /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
505 " /Encoding /Identity-H\n"
506 " /Subtype /Type0\n"
507 " /ToUnicode 6 0 R\n" // ToUnicode
508 " /Type /Font\n"
509 ">>\n"
510 "endobj\n");
511
512 // CIDFONTTYPE2
513 std::stringstream stream;
514 // Use "C" locale (needed for int values larger than 999).
515 stream.imbue(std::locale::classic());
516 stream << "4 0 obj\n"
517 "<<\n"
518 " /BaseFont /GlyphLessFont\n"
519 " /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
520 " /CIDSystemInfo\n"
521 " <<\n"
522 " /Ordering (Identity)\n"
523 " /Registry (Adobe)\n"
524 " /Supplement 0\n"
525 " >>\n"
526 " /FontDescriptor 7 0 R\n" // Font descriptor
527 " /Subtype /CIDFontType2\n"
528 " /Type /Font\n"
529 " /DW "
530 << (1000 / kCharWidth)
531 << "\n"
532 ">>\n"
533 "endobj\n";
534 AppendPDFObject(stream.str().c_str());
535
536 // CIDTOGIDMAP
537 const int kCIDToGIDMapSize = 2 * (1 << 16);
538 const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
539 for (int i = 0; i < kCIDToGIDMapSize; i++) {
540 cidtogidmap[i] = (i % 2) ? 1 : 0;
541 }
542 size_t len;
543 unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
544 stream.str("");
545 stream << "5 0 obj\n"
546 "<<\n"
547 " /Length "
548 << len
549 << " /Filter /FlateDecode\n"
550 ">>\n"
551 "stream\n";
552 AppendString(stream.str().c_str());
553 long objsize = stream.str().size();
554 AppendData(reinterpret_cast<char *>(comp), len);
555 objsize += len;
556 lept_free(comp);
557 const char *endstream_endobj =
558 "endstream\n"
559 "endobj\n";
560 AppendString(endstream_endobj);
561 objsize += strlen(endstream_endobj);
562 AppendPDFObjectDIY(objsize);
563
564 const char stream2[] =
565 "/CIDInit /ProcSet findresource begin\n"
566 "12 dict begin\n"
567 "begincmap\n"
568 "/CIDSystemInfo\n"
569 "<<\n"
570 " /Registry (Adobe)\n"
571 " /Ordering (UCS)\n"
572 " /Supplement 0\n"
573 ">> def\n"
574 "/CMapName /Adobe-Identify-UCS def\n"
575 "/CMapType 2 def\n"
576 "1 begincodespacerange\n"
577 "<0000> <FFFF>\n"
578 "endcodespacerange\n"
579 "1 beginbfrange\n"
580 "<0000> <FFFF> <0000>\n"
581 "endbfrange\n"
582 "endcmap\n"
583 "CMapName currentdict /CMap defineresource pop\n"
584 "end\n"
585 "end\n";
586
587 // TOUNICODE
588 stream.str("");
589 stream << "6 0 obj\n"
590 "<< /Length "
591 << (sizeof(stream2) - 1)
592 << " >>\n"
593 "stream\n"
594 << stream2
595 << "endstream\n"
596 "endobj\n";
597 AppendPDFObject(stream.str().c_str());
598
599 // FONT DESCRIPTOR
600 stream.str("");
601 stream << "7 0 obj\n"
602 "<<\n"
603 " /Ascent 1000\n"
604 " /CapHeight 1000\n"
605 " /Descent -1\n" // Spec says must be negative
606 " /Flags 5\n" // FixedPitch + Symbolic
607 " /FontBBox [ 0 0 "
608 << (1000 / kCharWidth)
609 << " 1000 ]\n"
610 " /FontFile2 8 0 R\n"
611 " /FontName /GlyphLessFont\n"
612 " /ItalicAngle 0\n"
613 " /StemV 80\n"
614 " /Type /FontDescriptor\n"
615 ">>\n"
616 "endobj\n";
617 AppendPDFObject(stream.str().c_str());
618
619 stream.str("");
620 stream << datadir_.c_str() << "/pdf.ttf";
621 const uint8_t *font;
622 std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);
623 std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});
624 auto size = buffer.size();
625 if (size) {
626 font = buffer.data();
627 } else {
628#if !defined(NDEBUG)
629 tprintf("Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str());
630#endif
631 font = pdf_ttf;
632 size = sizeof(pdf_ttf);
633 }
634
635 // FONTFILE2
636 stream.str("");
637 stream << "8 0 obj\n"
638 "<<\n"
639 " /Length "
640 << size
641 << "\n"
642 " /Length1 "
643 << size
644 << "\n"
645 ">>\n"
646 "stream\n";
647 AppendString(stream.str().c_str());
648 objsize = stream.str().size();
649 AppendData(reinterpret_cast<const char *>(font), size);
650 objsize += size;
651 AppendString(endstream_endobj);
652 objsize += strlen(endstream_endobj);
653 AppendPDFObjectDIY(objsize);
654 return true;
655}
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

◆ EndDocumentHandler()

bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 891 of file pdfrenderer.cpp.

891 {
892 // We reserved the /Pages object number early, so that the /Page
893 // objects could refer to their parent. We finally have enough
894 // information to go fill it in. Using lower level calls to manipulate
895 // the offset record in two spots, because we are placing objects
896 // out of order in the file.
897
898 // PAGES
899 const long int kPagesObjectNumber = 2;
900 offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
901 std::stringstream stream;
902 // Use "C" locale (needed for int values larger than 999).
903 stream.imbue(std::locale::classic());
904 stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ ";
905 AppendString(stream.str().c_str());
906 size_t pages_objsize = stream.str().size();
907 for (const auto &page : pages_) {
908 stream.str("");
909 stream << page << " 0 R ";
910 AppendString(stream.str().c_str());
911 pages_objsize += stream.str().size();
912 }
913 stream.str("");
914 stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n";
915 AppendString(stream.str().c_str());
916 pages_objsize += stream.str().size();
917 offsets_.back() += pages_objsize; // manipulation #2
918
919 // INFO
920 std::string utf16_title = "FEFF"; // byte_order_marker
921 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
922 char utf16[kMaxBytesPerCodepoint];
923 for (char32 code : unicodes) {
924 if (CodepointToUtf16be(code, utf16)) {
925 utf16_title += utf16;
926 }
927 }
928
929 char *datestr = l_getFormattedDate();
930 stream.str("");
931 stream << obj_
932 << " 0 obj\n"
933 "<<\n"
934 " /Producer (Tesseract "
935 << tesseract::TessBaseAPI::Version()
936 << ")\n"
937 " /CreationDate (D:"
938 << datestr
939 << ")\n"
940 " /Title <"
941 << utf16_title.c_str()
942 << ">\n"
943 ">>\n"
944 "endobj\n";
945 lept_free(datestr);
946 AppendPDFObject(stream.str().c_str());
947 stream.str("");
948 stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
949 AppendString(stream.str().c_str());
950 for (int i = 1; i < obj_; i++) {
951 stream.str("");
952 stream.width(10);
953 stream.fill('0');
954 stream << offsets_[i] << " 00000 n \n";
955 AppendString(stream.str().c_str());
956 }
957 stream.str("");
958 stream << "trailer\n<<\n /Size " << obj_
959 << "\n"
960 " /Root 1 0 R\n" // catalog
961 " /Info "
962 << (obj_ - 1)
963 << " 0 R\n" // info
964 ">>\nstartxref\n"
965 << offsets_.back() << "\n%%EOF\n";
966 AppendString(stream.str().c_str());
967 return true;
968}
signed int char32
static const char * Version()
Definition: baseapi.cpp:241
const char * title() const
Definition: renderer.h:87
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220

The documentation for this class was generated from the following files: