31 std::stringstream& alto_str) {
32 int left, top, right, bottom;
33 it->
BoundingBox(level, &left, &top, &right, &bottom);
37 int height = bottom - top;
38 int width = right - left;
40 alto_str <<
" HPOS=\"" << hpos <<
"\"";
41 alto_str <<
" VPOS=\"" << vpos <<
"\"";
42 alto_str <<
" WIDTH=\"" << width <<
"\"";
43 alto_str <<
" HEIGHT=\"" << height <<
"\"";
47 alto_str <<
" WC=\"0." << wc <<
"\"";
58 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" 59 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" " 60 "xmlns:xlink=\"http://www.w3.org/1999/xlink\" " 61 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" " 62 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# " 63 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n" 65 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n" 66 "\t\t<sourceImageInformation>\n" 73 "\t\t</sourceImageInformation>\n" 74 "\t\t<OCRProcessing ID=\"OCR_0\">\n" 75 "\t\t\t<ocrProcessingStep>\n" 76 "\t\t\t\t<processingSoftware>\n" 77 "\t\t\t\t\t<softwareName>tesseract ");
81 "\t\t\t\t</processingSoftware>\n" 82 "\t\t\t</ocrProcessingStep>\n" 83 "\t\t</OCRProcessing>\n" 95 if (text ==
nullptr)
return false;
130 int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
138 wchar_t* uni16_str =
new WCHAR[str16_len];
140 uni16_str, str16_len);
141 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len,
nullptr,
142 0,
nullptr,
nullptr);
143 char* utf8_str =
new char[utf8_len];
144 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
151 std::stringstream alto_str;
153 alto_str.imbue(std::locale::classic());
155 <<
"\t\t<Page WIDTH=\"" <<
rect_width_ <<
"\" HEIGHT=\"" 157 <<
"\" PHYSICAL_IMG_NR=\"" << page_number <<
"\"" 158 <<
" ID=\"page_" << page_number <<
"\">\n" 159 <<
"\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\"" 171 alto_str <<
"\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt <<
"\"";
172 AddBoxToAlto(res_it,
RIL_BLOCK, alto_str);
177 alto_str <<
"\t\t\t\t\t<TextBlock ID=\"block_" << tcnt <<
"\"";
178 AddBoxToAlto(res_it,
RIL_PARA, alto_str);
183 alto_str <<
"\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt <<
"\"";
188 alto_str <<
"\t\t\t\t\t\t\t<String ID=\"string_" << wcnt <<
"\"";
189 AddBoxToAlto(res_it,
RIL_WORD, alto_str);
190 alto_str <<
" CONTENT=\"";
197 int left, top, right, bottom;
201 const std::unique_ptr<const char[]> grapheme(
203 if (grapheme && grapheme[0] != 0) {
204 alto_str <<
HOcrEscape(grapheme.get()).c_str();
213 if (last_word_in_line) {
214 alto_str <<
"\n\t\t\t\t\t\t</TextLine>\n";
220 int width = left - hpos;
221 alto_str <<
"<SP WIDTH=\"" << width <<
"\" VPOS=\"" << vpos
222 <<
"\" HPOS=\"" << hpos <<
"\"/>\n";
225 if (last_word_in_tblock) {
226 alto_str <<
"\t\t\t\t\t</TextBlock>\n";
230 if (last_word_in_cblock) {
231 alto_str <<
"\t\t\t\t</ComposedBlock>\n";
236 alto_str <<
"\t\t\t</PrintSpace>\n" 238 const std::string& text = alto_str.str();
240 char* result =
new char[text.length() + 1];
241 strcpy(result, text.c_str());
bool EndDocumentHandler() override
bool Empty(PageIteratorLevel level) const
const char * title() const
bool Next(PageIteratorLevel level) override
void AppendString(const char *s)
static const char * Version()
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
Tesseract * tesseract_
The underlying data object.
const char * string() const
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
ResultIterator * GetIterator()
float Confidence(PageIteratorLevel level) const
virtual char * GetUTF8Text(PageIteratorLevel level) const
STRING * input_file_
Name used by training code.
bool AddImageHandler(TessBaseAPI *api) override
bool IsAtBeginningOf(PageIteratorLevel level) const override
PAGE_RES * page_res_
The page-level data.
bool BeginDocumentHandler() override
int Recognize(ETEXT_DESC *monitor)
TessAltoRenderer(const char *outputbase)
void SetInputName(const char *name)
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
STRING HOcrEscape(const char *text)