tesseract  4.1.1
pageres.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.h (Formerly page_res.h)
3  * Description: Results classes used by control.c
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef PAGERES_H
20 #define PAGERES_H
21 
22 #include <cstdint> // for int32_t, int16_t
23 #include <set> // for std::pair
24 #include <vector> // for std::vector
25 #include <sys/types.h> // for int8_t
26 #include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS
27 #include "clst.h" // for CLIST_ITERATOR, CLISTIZEH
28 #include "elst.h" // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH
29 #include "genericvector.h" // for GenericVector, PointerVector (ptr only)
30 #include "matrix.h" // for MATRIX
31 #include "normalis.h" // for DENORM
32 #include "ratngs.h" // for WERD_CHOICE, BLOB_CHOICE (ptr only)
33 #include "rect.h" // for TBOX
34 #include "rejctmap.h" // for REJMAP
35 #include "strngs.h" // for STRING
36 #include "unichar.h" // for UNICHAR_ID, INVALID_UNICHAR_ID
37 #include "unicharset.h" // for UNICHARSET, UNICHARSET::Direction, UNI...
38 #include "werd.h" // for WERD, W_BOL, W_EOL
39 
40 class BLOCK;
41 class BLOCK_LIST;
42 class BLOCK_RES;
43 class ROW;
44 class ROW_RES;
45 class SEAM;
46 class WERD_RES;
47 
48 struct Pix;
49 struct TWERD;
50 
51 template <class R, class A1, class A2> class TessResultCallback2;
52 
53 namespace tesseract {
54  class BoxWord;
55  class Tesseract;
56  struct FontInfo;
57 }
59 
60 /* Forward declarations */
61 
62 class BLOCK_RES;
63 
65 class
66 ROW_RES;
67 
69 class WERD_RES;
70 
72 
73 /*************************************************************************
74  * PAGE_RES - Page results
75  *************************************************************************/
76 class PAGE_RES { // Page-level results: counters, a rejected flag and the list of block results.
77  public:
78  int32_t char_count; // zeroed by Init(); presumably the char total for the page - confirm
79  int32_t rej_count; // zeroed by Init(); presumably the rejected-char total for the page - confirm
80  BLOCK_RES_LIST block_res_list; // list of BLOCK_RES; not touched by Init()
81  bool rejected; // set to false by Init()
82  // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to
83  // the next word. This pointer is not owned by PAGE_RES class. NOTE(review): the declaration this describes (prev_word_best_choice, reset in Init(); original line 84) is absent from this listing.
85  // Sums of blame reasons computed by the blamer. NOTE(review): the blame_reasons declaration (original line 86) is absent from this listing.
87  // Debug information about all the misadaptions on this page.
88  // Each BlamerBundle contains an index into this vector, so that words that
89  // caused misadaption could be marked. However, since words could be
90  // deleted/split/merged, the log is stored on the PAGE_RES level. NOTE(review): the misadaption_log declaration (original line 91) is absent from this listing.
92 
93  inline void Init() { // Resets the counters, the rejected flag, prev_word_best_choice and blame_reasons.
94  char_count = 0;
95  rej_count = 0;
96  rejected = false;
97  prev_word_best_choice = nullptr;
98  blame_reasons.init_to_size(IRR_NUM_REASONS, 0); // IRR_NUM_REASONS entries, presumably each set to 0
99  }
100 
101  PAGE_RES() { Init(); } // empty constructor: only calls Init()
102 
103  PAGE_RES(bool merge_similar_words,
104  BLOCK_LIST *block_list, // real blocks
105  WERD_CHOICE **prev_word_best_choice_ptr); // declaration only; no body in this header
106 
107  ~PAGE_RES () = default;
108 };
109 
110 /*************************************************************************
111  * BLOCK_RES - Block results
112  *************************************************************************/
113 
114 class BLOCK_RES:public ELIST_LINK { // Results for one BLOCK; derives from ELIST_LINK (presumably so it can be held in a BLOCK_RES_LIST).
115  public:
116  BLOCK * block; // real block
117  int32_t char_count; // chars in block
118  int32_t rej_count; // rejected chars
119  int16_t font_class; // NOTE(review): meaning is not documented in this header
120  int16_t row_count; // presumably the number of rows in the block - confirm
121  float x_height; // presumably a block-level x-height estimate - confirm
122  bool font_assigned; // block already
123  // processed
124 
125  ROW_RES_LIST row_res_list; // list of ROW_RES, presumably one per row of this block
126 
127  BLOCK_RES() = default; // NOTE: under default-initialization the pointer/scalar members above are left indeterminate
128 
129  BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block; declaration only, no body in this header
130 
131  ~BLOCK_RES () = default;
132 };
133 
134 /*************************************************************************
135  * ROW_RES - Row results
136  *************************************************************************/
137 
138 class ROW_RES:public ELIST_LINK { // Results for one ROW; derives from ELIST_LINK (presumably so it can be held in a ROW_RES_LIST).
139  public:
140  ROW * row; // real row
141  int32_t char_count; // chars in row (previously labelled "chars in block", presumably a copy-paste slip - confirm)
142  int32_t rej_count; // rejected chars
143  int32_t whole_word_rej_count; // rejs in total rej wds (presumably rejected chars that belong to wholly rejected words - confirm)
144  WERD_RES_LIST word_res_list; // list of WERD_RES, presumably one per word of this row
145 
146  ROW_RES() = default; // NOTE: under default-initialization the pointer/scalar members above are left indeterminate
147 
148  ROW_RES(bool merge_similar_words, ROW *the_row); // real row; declaration only, no body in this header
149 
150  ~ROW_RES() = default;
151 };
152 
153 /*************************************************************************
154  * WERD_RES - Word results
155  *************************************************************************/
157 {
162 };
163 
164 // WERD_RES is a collection of publicly accessible members that gathers
165 // information about a word result.
166 class WERD_RES : public ELIST_LINK {
167  public:
168  // Which word is which?
169  // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
170  // the original image coordinate space, and the BLN space in which the
171  // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
172  // and the x-middle of the word is at 0.
173  // In the rotated pixel space, coordinates correspond to the input image,
174  // but may be rotated about the origin by a multiple of 90 degrees,
175  // and may therefore be negative.
176  // In any case a rotation by denorm.block()->re_rotation() will take them
177  // back to the original image.
178  // The other differences between words all represent different stages of
179  // processing during recognition.
180 
181  // ---------------------------INPUT-------------------------------------
182 
183  // The word is the input C_BLOBs in the rotated pixel space.
184  // word is NOT owned by the WERD_RES unless combination is true.
185  // All the other word pointers ARE owned by the WERD_RES.
186  WERD* word = nullptr; // Input C_BLOB word.
187 
188  // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
189 
190  // The bln_boxes contains the bounding boxes (only) of the input word, in the
191  // BLN space. The lengths of word and bln_boxes
192  // match as they are both before any chopping.
193  // TODO(rays) determine if docqual does anything useful and delete bln_boxes
194  // if it doesn't.
195  tesseract::BoxWord* bln_boxes = nullptr; // BLN input bounding boxes.
196  // The ROW that this word sits in. NOT owned by the WERD_RES.
197  ROW* blob_row = nullptr;
198  // The denorm provides the transformation to get back to the rotated image
199  // coords from the chopped_word/rebuild_word BLN coords, but each blob also
200  // has its own denorm.
201  DENORM denorm; // For use on chopped_word.
202  // Unicharset used by the classifier output in best_choice and raw_choice.
203  const UNICHARSET* uch_set = nullptr; // For converting back to utf8.
204 
205  // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
206  // ----Setup to a (different!) state expected by the various classifiers----
207  // TODO(rays) Tidy and make more consistent.
208 
209  // The chopped_word is also in BLN space, and represents the fully chopped
210  // character fragments that make up the word.
211  // The length of chopped_word matches length of seam_array + 1 (if set).
212  TWERD* chopped_word = nullptr; // BLN chopped fragments output.
213  // Vector of SEAM* holding chopping points matching chopped_word.
215  // Widths of blobs in chopped_word.
217  // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
218  // blob i and blob i+1.
220  // Stores the lstm choices of every timestep
221  std::vector<std::vector<std::pair<const char*, float>>> timesteps;
222  // Stores the lstm choices of every timestep segmented by character
223  std::vector<std::vector<std::vector<
224  std::pair<const char*, float>>>> segmented_timesteps;
225  // Symbol choices acquired during CTC
226  std::vector<std::vector<std::pair<const char*, float>>> CTC_symbol_choices;
227  // Stores if the timestep vector starts with a space
228  bool leading_space = false;
229  // Stores value when the word ends
230  int end = 0;
231  // Ratings matrix contains classifier choices for each classified combination
232  // of blobs. The dimension is the same as the number of blobs in chopped_word
233  // and the leading diagonal corresponds to classifier results of the blobs
234  // in chopped_word. The state_ members of best_choice, raw_choice and
235  // best_choices all correspond to this ratings matrix and allow extraction
236  // of the blob choices for any given WERD_CHOICE.
237  MATRIX* ratings = nullptr; // Owned pointer.
238  // Pointer to the first WERD_CHOICE in best_choices. This is the result that
239  // will be output from Tesseract. Note that this is now a borrowed pointer
240  // and should NOT be deleted.
241  WERD_CHOICE* best_choice = nullptr; // Borrowed pointer.
242  // The best raw_choice found during segmentation search. Differs from the
243  // best_choice by being the best result according to just the character
244  // classifier, not taking any language model information into account.
245  // Unlike best_choice, the pointer IS owned by this WERD_RES.
246  WERD_CHOICE* raw_choice = nullptr; // Owned pointer.
247  // Alternative results found during chopping/segmentation search stages.
248  // Note that being an ELIST, best_choices owns the WERD_CHOICEs.
249  WERD_CHOICE_LIST best_choices;
250 
251  // Truth bounding boxes, text and incorrect choice reason.
253 
254  // --------------OUTPUT FROM RECOGNITION-------------------------------
255  // --------------Not all fields are necessarily set.-------------------
256  // ---best_choice, raw_choice *must* end up set, with a box_word-------
257  // ---In complete output, the number of blobs in rebuild_word matches---
258  // ---the number of boxes in box_word, the number of unichar_ids in---
259  // ---best_choice, the number of ints in best_state, and the number---
260  // ---of strings in correct_text--------------------------------------
261  // ---SetupFake Sets everything to appropriate values if the word is---
262  // ---known to be bad before recognition.------------------------------
263 
264  // The rebuild_word is also in BLN space, but represents the final best
265  // segmentation of the word. Its length is therefore the same as box_word.
266  TWERD* rebuild_word = nullptr; // BLN best segmented word.
267  // The box_word is in the original image coordinate space. It is the
268  // bounding boxes of the rebuild_word, after denormalization.
269  // The length of box_word matches rebuild_word, best_state (if set) and
270  // correct_text (if set), as well as best_choice and represents the
271  // number of classified units in the output.
272  tesseract::BoxWord* box_word = nullptr; // Denormalized output boxes.
273  // The Tesseract that was used to recognize this word. Just a borrowed
274  // pointer. Note: Tesseract's class definition is in a higher-level library.
275  // We avoid introducing a cyclic dependency by not using the Tesseract
276  // within WERD_RES. We are just storing it to provide access to it
277  // for the top-level multi-language controller, and maybe for output of
278  // the recognized language.
279  // tesseract points to data owned elsewhere.
281  // The best_state stores the relationship between chopped_word and
282  // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
283  // adjacent blobs in chopped_word. The seams in seam_array are hidden
284  // within a rebuild_word blob and revealed between them.
285  GenericVector<int> best_state; // Number of blobs in each best blob.
286  // The correct_text is used during training and adaption to carry the
287  // text to the training system without the need for a unicharset. There
288  // is one entry in the vector for each blob in rebuild_word and box_word.
290 
291  // Less-well documented members.
292  // TODO(rays) Add more documentation here.
293  WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this.
294  REJMAP reject_map; // best_choice rejects
295  bool tess_failed = false;
296  /*
297  If tess_failed is true, one of the following tests failed when Tess
298  returned:
299  - The outword blob list was not the same length as the best_choice string;
300  - The best_choice string contained ALL blanks;
301  - The best_choice string was zero length
302  */
303  bool tess_accepted = false; // Tess thinks its ok?
304  bool tess_would_adapt = false; // Tess would adapt?
305  bool done = false; // ready for output?
306  bool small_caps = false; // word appears to be small caps
307  bool odd_size = false; // word is bigger than line or leader dots.
308  // The fontinfos are pointers to data owned by the classifier.
309  const FontInfo* fontinfo = nullptr;
310  const FontInfo* fontinfo2 = nullptr;
311  int8_t fontinfo_id_count = 0; // number of votes
312  int8_t fontinfo_id2_count = 0; // number of votes
313  bool guessed_x_ht = true;
314  bool guessed_caps_ht = true;
316  float x_height = 0.0f; // post match estimate
317  float caps_height = 0.0f; // post match estimate
318  float baseline_shift = 0.0f; // post match estimate.
319  // Certainty score for the spaces either side of this word (LSTM mode).
320  // MIN this value with the actual word certainty.
321  float space_certainty = 0.0f;
322 
323  /*
324  To deal with fuzzy spaces we need to be able to combine "words" to form
325  combinations when we suspect that the gap is a non-space. The (new) text
326  ord code generates separate words for EVERY fuzzy gap - flags in the word
327  indicate whether the gap is below the threshold (fuzzy kern) and is thus
328  NOT a real word break by default, or above the threshold (fuzzy space) and
329  this is a real word break by default.
330 
331  The WERD_RES list contains all these words PLUS "combination" words built
332  out of (copies of) the words split by fuzzy kerns. The separate parts have
333  their "part_of_combo" flag set true and should be IGNORED on a default
334  reading of the list.
335 
336  Combination words are FOLLOWED by the sequence of part_of_combo words
337  which they combine.
338  */
339  bool combination = false; //of two fuzzy gap wds
340  bool part_of_combo = false; //part of a combo
341  bool reject_spaces = false; //Reject spacing?
342 
343  WERD_RES() = default;
344 
345  WERD_RES(WERD *the_word) {
346  word = the_word;
347  }
348  // Deep copies everything except the ratings MATRIX.
349  // To get that use deep_copy below.
350  WERD_RES(const WERD_RES& source) : ELIST_LINK(source) {
351  // combination is used in function Clear which is called from operator=.
352  combination = false;
353  *this = source; // see operator=
354  }
355 
356  ~WERD_RES();
357 
358  // Returns the UTF-8 string for the given blob index in the best_choice word,
359  // given that we know whether we are in a right-to-left reading context.
360  // This matters for mirrorable characters such as parentheses. We recognize
361  // characters purely based on their shape on the page, and by default produce
362  // the corresponding unicode for a left-to-right context. Returns nullptr if best_choice or uch_set is unset, or if blob_index or the unichar id is out of range.
363  const char* BestUTF8(int blob_index, bool in_rtl_context) const {
364  if (blob_index < 0 || best_choice == nullptr || uch_set == nullptr ||
365  blob_index >= best_choice->length())
366  return nullptr; // Unset members (both default to nullptr) or index out of range.
367  UNICHAR_ID id = best_choice->unichar_id(blob_index);
368  if (id < 0 || id >= uch_set->size())
369  return nullptr; // The id is not a valid index into the unicharset.
370  UNICHAR_ID mirrored = uch_set->get_mirror(id);
371  if (in_rtl_context && mirrored > 0) // Substitute the mirror only in RTL context and for a positive mirror id.
372  id = mirrored;
373  return uch_set->id_to_unichar_ext(id);
374  }
375  // Returns the UTF-8 string for the given blob index in the raw_choice word, or nullptr if raw_choice or uch_set is unset, or if blob_index or the unichar id is out of range.
376  const char* RawUTF8(int blob_index) const {
377  if (blob_index < 0 || raw_choice == nullptr || uch_set == nullptr || blob_index >= raw_choice->length())
378  return nullptr; // Unset members (both default to nullptr) or index out of range.
379  UNICHAR_ID id = raw_choice->unichar_id(blob_index);
380  if (id < 0 || id >= uch_set->size())
381  return nullptr; // The id is not a valid index into the unicharset.
382  return uch_set->id_to_unichar(id);
383  }
384 
385  UNICHARSET::Direction SymbolDirection(int blob_index) const {
386  if (best_choice == nullptr ||
387  blob_index >= best_choice->length() ||
388  blob_index < 0)
390  return uch_set->get_direction(best_choice->unichar_id(blob_index));
391  }
392 
393  bool AnyRtlCharsInWord() const {
394  if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1)
395  return false;
396  for (int id = 0; id < best_choice->length(); id++) {
397  int unichar_id = best_choice->unichar_id(id);
398  if (unichar_id < 0 || unichar_id >= uch_set->size())
399  continue; // Ignore illegal chars.
401  uch_set->get_direction(unichar_id);
402  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
404  return true;
405  }
406  return false;
407  }
408 
409  bool AnyLtrCharsInWord() const {
410  if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1)
411  return false;
412  for (int id = 0; id < best_choice->length(); id++) {
413  int unichar_id = best_choice->unichar_id(id);
414  if (unichar_id < 0 || unichar_id >= uch_set->size())
415  continue; // Ignore illegal chars.
416  UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
417  if (dir == UNICHARSET::U_LEFT_TO_RIGHT ||
419  return true;
420  }
421  return false;
422  }
423 
424  // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
425  // that gave us the unichars in reading order (as opposed to strict left
426  // to right).
427  bool UnicharsInReadingOrder() const {
429  }
430 
431  void Clear();
432  void ClearResults();
433  void ClearWordChoices();
434  void ClearRatings();
435 
436  // Deep copies everything except the ratings MATRIX.
437  // To get that use deep_copy below.
438  WERD_RES& operator=(const WERD_RES& source); //from this
439 
440  void CopySimpleFields(const WERD_RES& source);
441 
442  // Initializes a blank (default constructed) WERD_RES from one that has
443  // already been recognized.
444  // Use SetupFor*Recognition afterwards to complete the setup and make
445  // it ready for a retry recognition.
446  void InitForRetryRecognition(const WERD_RES& source);
447 
448  // Sets up the members used in recognition: bln_boxes, chopped_word,
449  // seam_array, denorm. Returns false if
450  // the word is empty and sets up fake results. If use_body_size is
451  // true and row->body_size is set, then body_size will be used for
452  // blob normalization instead of xheight + ascrise. This flag is for
453  // those languages that are using CJK pitch model and thus it has to
454  // be true if and only if tesseract->textord_use_cjk_fp_model is
455  // true.
456  // If allow_detailed_fx is true, the feature extractor will receive fine
457  // precision outline information, allowing smoother features and better
458  // features on low resolution images.
459  // The norm_mode sets the default mode for normalization in absence
460  // of any of the above flags. It should really be a tesseract::OcrEngineMode
461  // but is declared as int for ease of use with tessedit_ocr_engine_mode.
462  // Returns false if the word is empty and sets up fake results.
463  bool SetupForRecognition(const UNICHARSET& unicharset_in,
464  tesseract::Tesseract* tesseract, Pix* pix,
465  int norm_mode,
466  const TBOX* norm_box, bool numeric_mode,
467  bool use_body_size, bool allow_detailed_fx,
468  ROW *row, const BLOCK* block);
469 
470  // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
471  // accumulators from a made chopped word. We presume the fields are already
472  // empty.
473  void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
474 
475  // Sets up the members used in recognition for an empty recognition result:
476  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
477  void SetupFake(const UNICHARSET& uch);
478 
479  // Set the word as having the script of the input unicharset.
480  void SetupWordScript(const UNICHARSET& unicharset_in);
481 
482  // Sets up the blamer_bundle if it is not null, using the initialized denorm.
483  void SetupBlamerBundle();
484 
485  // Computes the blob_widths and blob_gaps from the chopped_word.
486  void SetupBlobWidthsAndGaps();
487 
488  // Updates internal data to account for a new SEAM (chop) at the given
489  // blob_number. Fixes the ratings matrix and states in the choices, as well
490  // as the blob widths and gaps.
491  void InsertSeam(int blob_number, SEAM* seam);
492 
493  // Returns true if all the word choices except the first have adjust_factors
494  // worse than the given threshold.
495  bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
496 
497  // Returns true if the current word is ambiguous (by number of answers or
498  // by dangerous ambigs.)
499  bool IsAmbiguous();
500 
501  // Returns true if the ratings matrix size matches the sum of each of the
502  // segmentation states.
503  bool StatesAllValid();
504 
505  // Prints a list of words found if debug is true or the word result matches
506  // the word_to_debug.
507  void DebugWordChoices(bool debug, const char* word_to_debug);
508 
509  // Prints the top choice along with the accepted/done flags.
510  void DebugTopChoice(const char* msg) const;
511 
512  // Removes from best_choices all choices which are not within a reasonable
513  // range of the best choice.
514  void FilterWordChoices(int debug_level);
515 
516  // Computes a set of distance thresholds used to control adaption.
517  // Compares the best choice for the current word to the best raw choice
518  // to determine which characters were classified incorrectly by the
519  // classifier. Then places a separate threshold into thresholds for each
520  // character in the word. If the classifier was correct, max_rating is placed
521  // into thresholds. If the classifier was incorrect, the mean match rating
522  // (error percentage) of the classifier's incorrect choice minus some margin
523  // is placed into thresholds. This can then be used by the caller to try to
524  // create a new template for the desired class that will classify the
525  // character with a rating better than the threshold value. The match rating
526  // placed into thresholds is never allowed to be below min_rating in order to
527  // prevent trying to make overly tight templates.
528  // min_rating limits how tight to make a template.
529  // max_rating limits how loose to make a template.
530  // rating_margin denotes the amount of margin to put in template.
531  void ComputeAdaptionThresholds(float certainty_scale,
532  float min_rating,
533  float max_rating,
534  float rating_margin,
535  float* thresholds);
536 
537  // Saves a copy of the word_choice if it has the best unadjusted rating.
538  // Returns true if the word_choice was the new best.
539  bool LogNewRawChoice(WERD_CHOICE* word_choice);
540  // Consumes word_choice by adding it to best_choices, (taking ownership) if
541  // the certainty for word_choice is some distance of the best choice in
542  // best_choices, or by deleting the word_choice and returning false.
543  // The best_choices list is kept in sorted order by rating. Duplicates are
544  // removed, and the list is kept no longer than max_num_choices in length.
545  // Returns true if the word_choice is still a valid pointer.
546  bool LogNewCookedChoice(int max_num_choices, bool debug,
547  WERD_CHOICE* word_choice);
548 
549  // Prints a brief list of all the best choices.
550  void PrintBestChoices() const;
551 
552  // Returns the sum of the widths of the blob between start_blob and last_blob
553  // inclusive.
554  int GetBlobsWidth(int start_blob, int last_blob);
555  // Returns the width of a gap between the specified blob and the next one.
556  int GetBlobsGap(int blob_index);
557 
558  // Returns the BLOB_CHOICE corresponding to the given index in the
559  // best choice word taken from the appropriate cell in the ratings MATRIX.
560  // Borrowed pointer, so do not delete. May return nullptr if there is no
561  // BLOB_CHOICE matching the unichar_id at the given index.
562  BLOB_CHOICE* GetBlobChoice(int index) const;
563 
564  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
565  // best choice word taken from the appropriate cell in the ratings MATRIX.
566  // Borrowed pointer, so do not delete.
567  BLOB_CHOICE_LIST* GetBlobChoices(int index) const;
568 
569  // Moves the results fields from word to this. This takes ownership of all
570  // the data, so src can be destructed.
571  // word1.ConsumeWordResult(word);
572  // delete word;
573  // is simpler and faster than:
574  // word1 = *word;
575  // delete word;
576  // as it doesn't need to copy and reallocate anything.
578 
579  // Replace the best choice and rebuild box word.
580  // choice must be from the current best_choices list.
581  void ReplaceBestChoice(WERD_CHOICE* choice);
582 
583  // Builds the rebuild_word and sets the best_state from the chopped_word and
584  // the best_choice->state.
585  void RebuildBestState();
586 
587  // Copies the chopped_word to the rebuild_word, faking a best_state as well.
588  // Also sets up the output box_word.
589  void CloneChoppedToRebuild();
590 
591  // Sets/replaces the box_word with one made from the rebuild_word.
592  void SetupBoxWord();
593 
594  // Sets up the script positions in the best_choice using the best_choice
595  // to get the unichars, and the unicharset to get the target positions.
596  void SetScriptPositions();
597  // Sets all the blobs in all the words (best choice and alternates) to be
598  // the given position. (When a sub/superscript is recognized as a separate
599  // word, it falls victim to the rule that a whole word cannot be sub or
600  // superscript, so this function overrides that problem.)
602 
603  // Classifies the word with some already-calculated BLOB_CHOICEs.
604  // The choices are an array of blob_count pointers to BLOB_CHOICE,
605  // providing a single classifier result for each blob.
606  // The BLOB_CHOICEs are consumed and the word takes ownership.
607  // The number of blobs in the box_word must match blob_count.
608  void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
609 
610  // Creates a WERD_CHOICE for the word using the top choices from the leading
611  // diagonal of the ratings matrix.
612  void FakeWordFromRatings(PermuterType permuter);
613 
614  // Copies the best_choice strings to the correct_text for adaption/training.
616 
617  // Merges 2 adjacent blobs in the result if the permanent callback
618  // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
619  // callback box_cb is nullptr or returns true, setting the merged blob
620  // result to the class returned from class_cb.
621  // Returns true if anything was merged.
625 
626  // Merges 2 adjacent blobs in the result (index and index+1) and corrects
627  // all the data to account for the change.
628  void MergeAdjacentBlobs(int index);
629 
630  // Callback helper for fix_quotes returns a double quote if both
631  // arguments are quote, otherwise INVALID_UNICHAR_ID.
633  void fix_quotes();
634 
635  // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
636  // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
638  // Callback helper for fix_hyphens returns true if box1 and box2 overlap
639  // (assuming both on the same textline, are in order and a chopped em dash.)
640  bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
641  void fix_hyphens();
642 
643  // Callback helper for merge_tess_fails returns a space if both
644  // arguments are space, otherwise INVALID_UNICHAR_ID.
646  void merge_tess_fails();
647 
648  // Returns a really deep copy of *src, including the ratings MATRIX. src must be non-null (it is dereferenced unconditionally); the result is allocated with new.
649  static WERD_RES* deep_copy(const WERD_RES* src) {
650  auto* result = new WERD_RES(*src); // Copy constructor: everything except ratings.
651  // That didn't copy the ratings, but we want a copy if there is one to
652  // begin with.
653  if (src->ratings != nullptr)
654  result->ratings = src->ratings->DeepCopy();
655  return result;
656  }
657 
658  // Copy blobs from word_res onto this word (eliminating spaces between).
659  // Because this may be called bidirectionally, the W_BOL and W_EOL flags of the two words are OR-ed together. Both word and word_res->word are dereferenced without null checks.
660  void copy_on(WERD_RES *word_res) { // word_res: the word to copy from
661  word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
662  word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
663  word->copy_on(word_res->word); // Delegates the actual copy to WERD::copy_on.
664  }
665 
666  // Returns true if the collection of count pieces, starting at start, are all
667  // natural connected components, ie there are no real chops involved.
668  bool PiecesAllNatural(int start, int count) const;
669 };
670 
671 /*************************************************************************
672  * PAGE_RES_IT - Page results iterator
673  *************************************************************************/
674 
675 class PAGE_RES_IT {
676  public:
677  PAGE_RES * page_res; // page being iterated
678 
679  PAGE_RES_IT() = default;
680 
681  PAGE_RES_IT(PAGE_RES *the_page_res) { // page result
682  page_res = the_page_res;
683  restart_page(); // ready to scan
684  }
685 
686  // Do two PAGE_RES_ITs point at the same word?
687  // This is much cheaper than cmp().
688  bool operator ==(const PAGE_RES_IT &other) const {
689  return word_res == other.word_res && row_res == other.row_res &&
690  block_res == other.block_res;
691  }
692 
693  bool operator !=(const PAGE_RES_IT &other) const {return !(*this == other); }
694 
695  // Given another PAGE_RES_IT to the same page,
696  // this before other: -1
697  // this equal to other: 0
698  // this later than other: 1
699  int cmp(const PAGE_RES_IT &other) const;
700 
702  return start_page(false); // Skip empty blocks.
703  }
705  return start_page(true); // Allow empty blocks.
706  }
707  WERD_RES *start_page(bool empty_ok);
708 
710 
711  // ============ Methods that mutate the underlying structures ===========
712  // Note that these methods will potentially invalidate other PAGE_RES_ITs
713  // and are intended to be used only while a single PAGE_RES_IT is active.
714  // This problem needs to be taken into account if these mutation operators
715  // are ever provided to PageIterator or its subclasses.
716 
717  // Inserts the new_word and a corresponding WERD_RES before the current
718  // position. The simple fields of the WERD_RES are copied from clone_res and
719  // the resulting WERD_RES is returned for further setup with best_choice etc.
720  WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);
721 
722  // Replaces the current WERD/WERD_RES with the given words. The given words
723  // contain fake blobs that indicate the position of the characters. These are
724  // replaced with real blobs from the current word as much as possible.
726 
727  // Deletes the current WERD_RES and its underlying WERD.
728  void DeleteCurrentWord();
729 
730  // Makes the current word a fuzzy space if not already fuzzy. Updates
731  // corresponding part of combo if required.
732  void MakeCurrentWordFuzzy();
733 
734  WERD_RES *forward() { // Get next word.
735  return internal_forward(false, false);
736  }
737  // Move forward, but allow empty blocks to show as single nullptr words.
739  return internal_forward(false, true);
740  }
741 
742  WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph
743  WERD_RES *forward_block(); // get first word in next non-empty block
744 
745  WERD_RES *prev_word() const { // previous word
746  return prev_word_res;
747  }
748  ROW_RES *prev_row() const { // row of prev word
749  return prev_row_res;
750  }
751  BLOCK_RES *prev_block() const { // block of prev word
752  return prev_block_res;
753  }
754  WERD_RES *word() const { // current word
755  return word_res;
756  }
757  ROW_RES *row() const { // row of current word
758  return row_res;
759  }
760  BLOCK_RES *block() const { // block of cur. word
761  return block_res;
762  }
763  WERD_RES *next_word() const { // next word
764  return next_word_res;
765  }
766  ROW_RES *next_row() const { // row of next word
767  return next_row_res;
768  }
769  BLOCK_RES *next_block() const { // block of next word
770  return next_block_res;
771  }
772  void rej_stat_word(); // for page/block/row
773  void ResetWordIterator();
774 
775  private:
776  WERD_RES *internal_forward(bool new_block, bool empty_ok);
777 
778  WERD_RES * prev_word_res; // previous word
779  ROW_RES *prev_row_res; // row of prev word
780  BLOCK_RES *prev_block_res; // block of prev word
781 
782  WERD_RES *word_res; // current word
783  ROW_RES *row_res; // row of current word
784  BLOCK_RES *block_res; // block of cur. word
785 
786  WERD_RES *next_word_res; // next word
787  ROW_RES *next_row_res; // row of next word
788  BLOCK_RES *next_block_res; // block of next word
789 
790  BLOCK_RES_IT block_res_it; // iterators
791  ROW_RES_IT row_res_it;
792  WERD_RES_IT word_res_it;
793  // Iterators used to get the state of word_res_it for the current word.
794  // Since word_res_it is 2 words further on, this is otherwise hard to do.
795  WERD_RES_IT wr_it_of_current_word;
796  WERD_RES_IT wr_it_of_next_word;
797 };
798 #endif
int UNICHAR_ID
Definition: unichar.h:34
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1059
GenericVector< int > blame_reasons
Definition: pageres.h:86
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
void ResetWordIterator()
Definition: pageres.cpp:1523
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:730
WERD_RES * restart_page_with_empties()
Definition: pageres.h:704
BLOCK_RES * prev_block() const
Definition: pageres.h:751
PermuterType
Definition: ratngs.h:232
const FontInfo * fontinfo2
Definition: pageres.h:310
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1008
UNICHARSET::Direction SymbolDirection(int blob_index) const
Definition: pageres.h:385
bool combination
Definition: pageres.h:339
void copy_on(WERD *other)
Definition: werd.cpp:221
const FontInfo * fontinfo
Definition: pageres.h:309
Definition: blobs.h:418
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:561
const char * BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:363
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:251
int size() const
Definition: unicharset.h:341
ROW * blob_row
Definition: pageres.h:197
void init_to_size(int size, const T &t)
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:315
float x_height
Definition: pageres.h:316
bool part_of_combo
Definition: pageres.h:340
int length() const
Definition: ratngs.h:293
void PrintBestChoices() const
Definition: pageres.cpp:717
GenericVector< STRING > misadaption_log
Definition: pageres.h:91
REJMAP reject_map
Definition: pageres.h:294
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1078
WERD_RES * forward_with_empties()
Definition: pageres.h:738
void copy_on(WERD_RES *word_res)
Definition: pageres.h:660
bool guessed_caps_ht
Definition: pageres.h:314
void fix_hyphens()
Definition: pageres.cpp:1047
BLOCK_RES * next_block() const
Definition: pageres.h:769
int8_t fontinfo_id2_count
Definition: pageres.h:312
ROW * row
Definition: pageres.h:140
Definition: matrix.h:578
TWERD * rebuild_word
Definition: pageres.h:266
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:974
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:750
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:277
WERD_CHOICE * raw_choice
Definition: pageres.h:246
ROW_RES * next_row() const
Definition: pageres.h:766
ROW_RES * row() const
Definition: pageres.h:757
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
Definition: pageres.h:224
BlamerBundle * blamer_bundle
Definition: pageres.h:252
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:759
int32_t char_count
Definition: pageres.h:141
float caps_height
Definition: pageres.h:317
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:400
bool odd_size
Definition: pageres.h:307
void SetupBoxWord()
Definition: pageres.cpp:849
float x_height
Definition: pageres.h:121
PAGE_RES_IT(PAGE_RES *the_page_res)
Definition: pageres.h:681
int32_t whole_word_rej_count
Definition: pageres.h:143
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:865
bool small_caps
Definition: pageres.h:306
bool tess_accepted
Definition: pageres.h:303
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:765
ELISTIZEH(BLOCK_RES) CLISTIZEH(BLOCK_RES) class ROW_RES
bool operator==(const PAGE_RES_IT &other) const
Definition: pageres.h:688
const char * RawUTF8(int blob_index) const
Definition: pageres.h:376
end of line
Definition: werd.h:33
Definition: ocrrow.h:36
BLOCK_RES * block() const
Definition: pageres.h:760
TWERD * chopped_word
Definition: pageres.h:212
bool font_assigned
Definition: pageres.h:122
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:877
WERD_RES()=default
void SetScriptPositions()
Definition: pageres.cpp:858
WERD_RES * prev_word() const
Definition: pageres.h:745
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:439
int end
Definition: pageres.h:230
~BLOCK_RES()=default
void merge_tess_fails()
Definition: pageres.cpp:1067
ROW_RES()=default
MATRIX * DeepCopy() const
Definition: matrix.cpp:94
MATRIX * ratings
Definition: pageres.h:237
WERD_RES(WERD *the_word)
Definition: pageres.h:345
tesseract::BoxWord * bln_boxes
Definition: pageres.h:195
void fix_quotes()
Definition: pageres.cpp:1018
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:384
void SetupBlamerBundle()
Definition: pageres.cpp:393
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1041
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:740
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:118
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1473
WERD_RES * restart_page()
Definition: pageres.h:701
GenericVector< SEAM * > seam_array
Definition: pageres.h:214
bool StatesAllValid()
Definition: pageres.cpp:458
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:418
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:499
GenericVector< int > blob_widths
Definition: pageres.h:216
bool tess_would_adapt
Definition: pageres.h:304
int16_t font_class
Definition: pageres.h:119
DENORM denorm
Definition: pageres.h:201
void Clear()
Definition: pageres.cpp:1094
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1500
void RebuildBestState()
Definition: pageres.cpp:808
Definition: ocrblock.h:29
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:299
GenericVector< int > best_state
Definition: pageres.h:285
WERD_RES_LIST word_res_list
Definition: pageres.h:144
CRUNCH_MODE
Definition: pageres.h:156
int count(LIST var_list)
Definition: oldlist.cpp:95
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:188
bool guessed_x_ht
Definition: pageres.h:313
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1213
ROW_RES_LIST row_res_list
Definition: pageres.h:125
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1645
PAGE_RES_IT()=default
void ClearRatings()
Definition: pageres.cpp:1137
start of line
Definition: werd.h:32
WERD_RES * next_word() const
Definition: pageres.h:763
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
WERD_CHOICE * ep_choice
Definition: pageres.h:293
int16_t row_count
Definition: pageres.h:120
bool AnyRtlCharsInWord() const
Definition: pageres.h:393
WERD_RES * forward_block()
Definition: pageres.cpp:1660
WERD_RES * restart_row()
Definition: pageres.cpp:1630
int32_t rej_count
Definition: pageres.h:142
void CloneChoppedToRebuild()
Definition: pageres.cpp:835
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:513
ROW_RES * prev_row() const
Definition: pageres.h:748
int8_t fontinfo_id_count
Definition: pageres.h:311
void ClearWordChoices()
Definition: pageres.cpp:1129
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:690
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
GenericVector< int > blob_gaps
Definition: pageres.h:219
Definition: seam.h:38
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:604
int32_t char_count
Definition: pageres.h:78
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: pageres.h:221
Definition: rect.h:34
WERD * word
Definition: pageres.h:186
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:480
bool AnyLtrCharsInWord() const
Definition: pageres.h:409
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
Definition: reject.cpp:51
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:302
void rej_stat_word()
Definition: pageres.cpp:1667
bool tess_failed
Definition: pageres.h:295
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1030
bool rejected
Definition: pageres.h:81
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:649
bool leading_space
Definition: pageres.h:228
bool UnicharsInReadingOrder() const
Definition: pageres.h:427
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:352
void ClearResults()
Definition: pageres.cpp:1104
float baseline_shift
Definition: pageres.h:318
int32_t rej_count
Definition: pageres.h:118
int32_t char_count
Definition: pageres.h:117
void DeleteCurrentWord()
Definition: pageres.cpp:1440
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX &> *box_cb)
Definition: pageres.cpp:938
int32_t rej_count
Definition: pageres.h:79
void Init()
Definition: pageres.h:93
BLOCK * block
Definition: pageres.h:116
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:620
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
Definition: pageres.h:226
~ROW_RES()=default
const UNICHARSET * uch_set
Definition: pageres.h:203
WERD_CHOICE * best_choice
Definition: pageres.h:241
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1333
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1145
WERD_RES * word() const
Definition: pageres.h:754
WERD_RES * forward()
Definition: pageres.h:734
bool done
Definition: pageres.h:305
bool operator!=(const PAGE_RES_IT &other) const
Definition: pageres.h:693
bool unichars_in_script_order() const
Definition: ratngs.h:525
tesseract::BoxWord * box_word
Definition: pageres.h:272
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:898
Definition: werd.h:56
WERD_CHOICE_LIST best_choices
Definition: pageres.h:249
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:697
WERD_RES(const WERD_RES &source)
Definition: pageres.h:350
PAGE_RES * page_res
Definition: pageres.h:677
void BestChoiceToCorrectText()
Definition: pageres.cpp:923
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:343
bool IsAmbiguous()
Definition: pageres.cpp:452
PAGE_RES()
Definition: pageres.h:101
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:84
float space_certainty
Definition: pageres.h:321
BLOCK_RES_LIST block_res_list
Definition: pageres.h:80
BLOCK_RES()=default
bool reject_spaces
Definition: pageres.h:341
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:795
GenericVector< STRING > correct_text
Definition: pageres.h:289