tesseract  4.1.1
hocrrenderer.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: hocrrenderer.cpp
3  * Description: Simple API for calling tesseract.
4  * Author: Ray Smith (original code from baseapi.cpp)
5  * Author: Stefan Weil (moved to separate file and cleaned code)
6  *
7  * (C) Copyright 2006, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <locale> // for std::locale::classic
21 #include <memory> // for std::unique_ptr
22 #include <sstream> // for std::stringstream
23 #include "baseapi.h" // for TessBaseAPI
24 #ifdef _WIN32
25 # include "host.h" // windows.h for MultiByteToWideChar, ...
26 #endif
27 #include "renderer.h"
28 #include "tesseractclass.h" // for Tesseract
29 
30 namespace tesseract {
31 
35 static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) {
36  tesseract::Orientation orientation;
37  tesseract::WritingDirection writing_direction;
38  tesseract::TextlineOrder textline_order;
39  float deskew_angle;
40  it->Orientation(&orientation, &writing_direction, &textline_order,
41  &deskew_angle);
42  return orientation;
43 }
44 
53 static void AddBaselineCoordsTohOCR(const PageIterator* it,
54  PageIteratorLevel level,
55  std::stringstream& hocr_str) {
56  tesseract::Orientation orientation = GetBlockTextOrientation(it);
57  if (orientation != ORIENTATION_PAGE_UP) {
58  hocr_str << "; textangle " << 360 - orientation * 90;
59  return;
60  }
61 
62  int left, top, right, bottom;
63  it->BoundingBox(level, &left, &top, &right, &bottom);
64 
65  // Try to get the baseline coordinates at this level.
66  int x1, y1, x2, y2;
67  if (!it->Baseline(level, &x1, &y1, &x2, &y2)) return;
68  // Following the description of this field of the hOCR spec, we convert the
69  // baseline coordinates so that "the bottom left of the bounding box is the
70  // origin".
71  x1 -= left;
72  x2 -= left;
73  y1 -= bottom;
74  y2 -= bottom;
75 
76  // Now fit a line through the points so we can extract coefficients for the
77  // equation: y = p1 x + p0
78  if (x1 == x2) {
79  // Problem computing the polynomial coefficients.
80  return;
81  }
82  double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
83  double p0 = y1 - p1 * x1;
84 
85  hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
86  << round(p0 * 1000.0) / 1000.0;
87 }
88 
89 static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
90  std::stringstream& hocr_str) {
91  int left, top, right, bottom;
92  it->BoundingBox(level, &left, &top, &right, &bottom);
93  // This is the only place we use double quotes instead of single quotes,
94  // but it may too late to change for consistency
95  hocr_str << " title=\"bbox " << left << " " << top << " " << right << " "
96  << bottom;
97  // Add baseline coordinates & heights for textlines only.
98  if (level == RIL_TEXTLINE) {
99  AddBaselineCoordsTohOCR(it, level, hocr_str);
100  // add custom height measures
101  float row_height, descenders, ascenders; // row attributes
102  it->RowAttributes(&row_height, &descenders, &ascenders);
103  // TODO(rays): Do we want to limit these to a single decimal place?
104  hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders
105  << "; x_ascenders " << ascenders;
106  }
107  hocr_str << "\">";
108 }
109 
119 char* TessBaseAPI::GetHOCRText(int page_number) {
120  return GetHOCRText(nullptr, page_number);
121 }
122 
132 char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
133  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
134  return nullptr;
135 
136  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1;
137  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
138  bool para_is_ltr = true; // Default direction is LTR
139  const char* paragraph_lang = nullptr;
140  bool font_info = false;
141  bool hocr_boxes = false;
142  GetBoolVariable("hocr_font_info", &font_info);
143  GetBoolVariable("hocr_char_boxes", &hocr_boxes);
144 
145  if (input_file_ == nullptr) SetInputName(nullptr);
146 
147 #ifdef _WIN32
148  // convert input name from ANSI encoding to utf-8
149  int str16_len =
150  MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
151  wchar_t* uni16_str = new WCHAR[str16_len];
152  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
153  uni16_str, str16_len);
154  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
155  0, nullptr, nullptr);
156  char* utf8_str = new char[utf8_len];
157  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
158  nullptr, nullptr);
159  *input_file_ = utf8_str;
160  delete[] uni16_str;
161  delete[] utf8_str;
162 #endif
163 
164  std::stringstream hocr_str;
165  // Use "C" locale (needed for double values x_size and x_descenders).
166  hocr_str.imbue(std::locale::classic());
167  // Use 8 digits for double values.
168  hocr_str.precision(8);
169  hocr_str << " <div class='ocr_page'";
170  hocr_str << " id='"
171  << "page_" << page_id << "'";
172  hocr_str << " title='image \"";
173  if (input_file_) {
174  hocr_str << HOcrEscape(input_file_->string()).c_str();
175  } else {
176  hocr_str << "unknown";
177  }
178  hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
179  << rect_width_ << " " << rect_height_ << "; ppageno " << page_number
180  << "'>\n";
181 
182  std::unique_ptr<ResultIterator> res_it(GetIterator());
183  while (!res_it->Empty(RIL_BLOCK)) {
184  if (res_it->Empty(RIL_WORD)) {
185  res_it->Next(RIL_WORD);
186  continue;
187  }
188 
189  // Open any new block/paragraph/textline.
190  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
191  para_is_ltr = true; // reset to default direction
192  hocr_str << " <div class='ocr_carea'"
193  << " id='"
194  << "block_" << page_id << "_" << bcnt << "'";
195  AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
196  }
197  if (res_it->IsAtBeginningOf(RIL_PARA)) {
198  hocr_str << "\n <p class='ocr_par'";
199  para_is_ltr = res_it->ParagraphIsLtr();
200  if (!para_is_ltr) {
201  hocr_str << " dir='rtl'";
202  }
203  hocr_str << " id='"
204  << "par_" << page_id << "_" << pcnt << "'";
205  paragraph_lang = res_it->WordRecognitionLanguage();
206  if (paragraph_lang) {
207  hocr_str << " lang='" << paragraph_lang << "'";
208  }
209  AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
210  }
211  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
212  hocr_str << "\n <span class='";
213  switch (res_it->BlockType()) {
214  case PT_HEADING_TEXT:
215  hocr_str << "ocr_header";
216  break;
217  case PT_PULLOUT_TEXT:
218  hocr_str << "ocr_textfloat";
219  break;
220  case PT_CAPTION_TEXT:
221  hocr_str << "ocr_caption";
222  break;
223  default:
224  hocr_str << "ocr_line";
225  }
226  hocr_str << "' id='"
227  << "line_" << page_id << "_" << lcnt << "'";
228  AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
229  }
230 
231  // Now, process the word...
232  std::vector<std::vector<std::pair<const char*, float>>>* choiceMap =
233  nullptr;
235 
236  choiceMap = res_it->GetBestLSTMSymbolChoices();
237  }
238  hocr_str << "\n <span class='ocrx_word'"
239  << " id='"
240  << "word_" << page_id << "_" << wcnt << "'";
241  int left, top, right, bottom;
242  bool bold, italic, underlined, monospace, serif, smallcaps;
243  int pointsize, font_id;
244  const char* font_name;
245  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
246  font_name =
247  res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
248  &serif, &smallcaps, &pointsize, &font_id);
249  hocr_str << " title='bbox " << left << " " << top << " " << right << " "
250  << bottom << "; x_wconf "
251  << static_cast<int>(res_it->Confidence(RIL_WORD));
252  if (font_info) {
253  if (font_name) {
254  hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
255  }
256  hocr_str << "; x_fsize " << pointsize;
257  }
258  hocr_str << "'";
259  const char* lang = res_it->WordRecognitionLanguage();
260  if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
261  hocr_str << " lang='" << lang << "'";
262  }
263  switch (res_it->WordDirection()) {
264  // Only emit direction if different from current paragraph direction
265  case DIR_LEFT_TO_RIGHT:
266  if (!para_is_ltr) hocr_str << " dir='ltr'";
267  break;
268  case DIR_RIGHT_TO_LEFT:
269  if (para_is_ltr) hocr_str << " dir='rtl'";
270  break;
271  case DIR_MIX:
272  case DIR_NEUTRAL:
273  default: // Do nothing.
274  break;
275  }
276  hocr_str << ">";
277  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
278  bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
279  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
280  if (bold) hocr_str << "<strong>";
281  if (italic) hocr_str << "<em>";
282  do {
283  const std::unique_ptr<const char[]> grapheme(
284  res_it->GetUTF8Text(RIL_SYMBOL));
285  if (grapheme && grapheme[0] != 0) {
286  if (hocr_boxes) {
287  res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
288  hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
289  << left << " " << top << " " << right << " " << bottom
290  << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
291  }
292  hocr_str << HOcrEscape(grapheme.get()).c_str();
293  if (hocr_boxes) {
294  hocr_str << "</span>";
295  }
296  }
297  res_it->Next(RIL_SYMBOL);
298  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
299  if (italic) hocr_str << "</em>";
300  if (bold) hocr_str << "</strong>";
301  // If the lstm choice mode is required it is added here
302  if (tesseract_->lstm_choice_mode == 1 && choiceMap != nullptr) {
303  for (auto timestep : *choiceMap) {
304  hocr_str << "\n <span class='ocrx_cinfo'"
305  << " id='"
306  << "timestep_" << page_id << "_" << wcnt << "_" << tcnt << "'"
307  << ">";
308  for (std::pair<const char*, float> conf : timestep) {
309  hocr_str << "<span class='ocr_glyph'"
310  << " id='"
311  << "choice_" << page_id << "_" << wcnt << "_" << gcnt << "'"
312  << " title='x_confs " << int(conf.second * 100) << "'>"
313  << conf.first << "</span>";
314  gcnt++;
315  }
316  hocr_str << "</span>";
317  tcnt++;
318  }
319  } else if (tesseract_->lstm_choice_mode == 2 && choiceMap != nullptr) {
320  for (auto timestep : *choiceMap) {
321  if (timestep.size() > 0) {
322  hocr_str << "\n <span class='ocrx_cinfo'"
323  << " id='"
324  << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
325  << "'>";
326  for (auto & j : timestep) {
327  hocr_str << "<span class='ocr_glyph'"
328  << " id='"
329  << "choice_" << page_id << "_" << wcnt << "_" << gcnt
330  << "'"
331  << " title='x_confs " << int(j.second * 100)
332  << "'>" << j.first << "</span>";
333  gcnt++;
334  }
335  hocr_str << "</span>";
336  tcnt++;
337  }
338  }
339  }
340  // Close ocrx_word.
341  if (hocr_boxes || tesseract_->lstm_choice_mode > 0) {
342  hocr_str << "\n ";
343  }
344  hocr_str << "</span>";
345  tcnt = 1;
346  gcnt = 1;
347  wcnt++;
348  // Close any ending block/paragraph/textline.
349  if (last_word_in_line) {
350  hocr_str << "\n </span>";
351  lcnt++;
352  }
353  if (last_word_in_para) {
354  hocr_str << "\n </p>\n";
355  pcnt++;
356  para_is_ltr = true; // back to default direction
357  }
358  if (last_word_in_block) {
359  hocr_str << " </div>\n";
360  bcnt++;
361  }
362  }
363  hocr_str << " </div>\n";
364 
365  const std::string& text = hocr_str.str();
366  char* result = new char[text.length() + 1];
367  strcpy(result, text.c_str());
368  return result;
369 }
370 
371 /**********************************************************************
372  * HOcr Text Renderer interface implementation
373  **********************************************************************/
374 TessHOcrRenderer::TessHOcrRenderer(const char* outputbase)
375  : TessResultRenderer(outputbase, "hocr") {
376  font_info_ = false;
377 }
378 
379 TessHOcrRenderer::TessHOcrRenderer(const char* outputbase, bool font_info)
380  : TessResultRenderer(outputbase, "hocr") {
381  font_info_ = font_info;
382 }
383 
385  AppendString(
386  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
387  "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
388  " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
389  "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
390  "lang=\"en\">\n <head>\n <title>");
391  AppendString(title());
392  AppendString(
393  "</title>\n"
394  " <meta http-equiv=\"Content-Type\" content=\"text/html;"
395  "charset=utf-8\"/>\n"
396  " <meta name='ocr-system' content='tesseract " PACKAGE_VERSION
397  "' />\n"
398  " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
399  " ocr_line ocrx_word ocrp_wconf");
400  if (font_info_) AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
401  AppendString(
402  "'/>\n"
403  " </head>\n"
404  " <body>\n");
405 
406  return true;
407 }
408 
410  AppendString(" </body>\n</html>\n");
411 
412  return true;
413 }
414 
416  const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum()));
417  if (hocr == nullptr) return false;
418 
419  AppendString(hocr.get());
420 
421  return true;
422 }
423 
424 } // namespace tesseract
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:306
const char * title() const
Definition: renderer.h:88
void AppendString(const char *s)
Definition: renderer.cpp:102
const char * c_str() const
Definition: strngs.cpp:205
bool AddImageHandler(TessBaseAPI *api) override
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:888
const char * string() const
Definition: strngs.cpp:194
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
ResultIterator * GetIterator()
Definition: baseapi.cpp:1324
STRING * input_file_
Name used by training code.
Definition: baseapi.h:896
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:895
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:830
bool EndDocumentHandler() override
void SetInputName(const char *name)
Definition: baseapi.cpp:271
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2310
TessHOcrRenderer(const char *outputbase, bool font_info)
bool BeginDocumentHandler() override