25 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_ 26 #define TESSERACT_CCMAIN_TESSERACTCLASS_H_ 30 #include "allheaders.h" 34 #ifndef DISABLED_LEGACY_ENGINE 108 class EquationDetect;
110 class LSTMRecognizer;
199 pixDestroy(&pix_binary_);
209 pixDestroy(&pix_grey_);
210 pix_grey_ = grey_pix;
213 return pix_original_;
217 pixDestroy(&pix_original_);
218 pix_original_ = original_pix;
220 for (
int i = 0; i < sub_langs_.size(); ++i) {
221 sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
234 if (pixGetWidth(pix_original_) ==
ImageWidth()) {
235 return pix_original_;
236 }
else if (pix_grey_ !=
nullptr) {
243 pixDestroy(&pix_thresholds_);
244 pix_thresholds_ = thresholds;
247 return source_resolution_;
250 source_resolution_ = ppi;
253 return pixGetWidth(pix_binary_);
256 return pixGetHeight(pix_binary_);
259 return scaled_color_;
262 return scaled_factor_;
265 scaled_factor_ = factor;
266 scaled_color_ = color;
276 return right_to_left_;
279 return sub_langs_.size();
282 return sub_langs_[index];
288 for (
int i = 0; i < sub_langs_.size(); ++i) {
298 for (
int i = 0; i < sub_langs_.size(); ++i) {
326 TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
330 OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
331 Pix** music_mask_pix);
341 const STRING& output_basename,
342 BLOCK_LIST* block_list);
355 int end_box,
const BLOCK& block);
363 TBOX* revised_box)
const;
375 const char* word_config,
int pass);
378 const char* word_config,
PAGE_RES* page_res,
386 const TBOX* target_word_box,
const char* word_config,
389 const TBOX* target_word_box,
const char* word_config);
404 bool* make_next_word_fuzzy);
438 STRING* best_str,
float* c2);
444 TBOX& selection_box);
450 const char* lengths);
475 float* leading_certainty,
int* num_rebuilt_trailing,
476 ScriptPos* trailing_pos,
float* trailing_certainty,
float* avg_certainty,
477 float* unlikely_threshold);
479 float leading_certainty,
ScriptPos leading_pos,
480 int num_chopped_trailing,
481 float trailing_certainty,
483 bool* is_good,
int* retry_leading,
484 int* retry_trailing);
486 float certainty_threshold,
int* left_ok,
487 int* right_ok)
const;
515 return init_tesseract(datapath,
nullptr, language, oem,
nullptr, 0,
nullptr,
516 nullptr,
false, &mgr);
536 char** configs,
int configs_size,
553 char** configs,
int configs_size,
556 bool set_only_init_params,
564 #ifndef GRAPHICS_DISABLED 566 #endif // GRAPHICS_DISABLED 590 int16_t
alpha_count(
const char* word,
const char* word_lengths);
632 void dump_words(WERD_RES_LIST& perm, int16_t score, int16_t mode,
639 #ifndef DISABLED_LEGACY_ENGINE 654 int16_t* accepted_match_count);
658 #ifndef DISABLED_LEGACY_ENGINE 699 BLOCK_LIST* block_list);
708 BLOCK_LIST* block_list);
723 const TBOX& box,
const TBOX* next_box,
724 const char* correct_text);
732 const TBOX* next_box,
const char* correct_text);
755 int choices_pos,
int choices_length,
757 int text_index,
float rating,
767 const char* err_msg);
785 "Take segmentation and labeling from box file");
787 "Conversion of word/line box file to char box file");
789 "Generate training data from boxed chars");
791 "Generate more boxes from boxed chars");
793 "Break input into lines and remap boxes if present");
795 "Dump intermediate images made during page segmentation");
797 "Try inverting the image in `LSTMRecognizeWord`");
799 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 800 " 5=line, 6=word, 7=char" 801 " (Values from PageSegMode enum in publictypes.h)");
803 "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" 804 " to loading and running the most accurate available.");
806 "Blacklist of chars not to recognize");
809 "List of chars to override tessedit_char_blacklist");
811 "Perform training for ambiguities");
814 "Whether to use the top-line splitting process for Devanagari " 815 "documents while performing page-segmentation.");
818 "Whether to use the top-line splitting process for Devanagari " 819 "documents while performing ocr.");
821 "Write all parameters to the given file.");
823 "Generate and print debug information for adaption");
828 "Exposure value follows this pattern in the image" 829 " filename. The name of the image files are expected" 830 " to be in the form [lang].[fontname].exp[num].tif");
832 "Learn both character fragments (as is done in the" 833 " special low exposure mode) as well as unfragmented" 836 "Each bounding box is assumed to contain ngrams. Only" 837 " learn the ngrams whose outlines overlap horizontally.");
843 "Don't bother with word plausibility");
846 "Add words to the document dictionary");
850 "Enable correction based on the word bigram dictionary.");
852 "Enable single word correction based on the dictionary.");
854 "Amount of debug output for bigram " 857 "Remove and conditionally reassign small outlines when they" 858 " confuse layout analysis, determining diacritics vs noise");
871 "Scaling on certainty diff from Hingepoint");
881 "good_quality_doc lte outline error limit");
885 "Adaptation decision algorithm for tess");
887 "Do minimal rejection on pass 1 output");
895 "Run paragraph detection on the post-text-recognition " 901 "Reduce rejection on good docs");
904 "%rej allowed before rej whole doc");
906 "%rej allowed before rej whole block");
908 "%rej allowed before rej whole row");
910 "Number of row rejects in whole word rejects" 911 "which prevents whole row rejection");
913 "Only rej partially rejected words in block rejection");
915 "Only rej partially rejected words in row rejection");
917 "Use word segmentation quality metric");
919 "Use word segmentation quality metric");
921 "Only preserve wds longer than this");
923 "Apply row rejection to good docs");
925 "rej good doc wd if more than this fraction rejected");
929 "Output data to debug file");
935 "Add coordinates for each character to hocr output");
957 "Don't pot crunch sensible strings");
960 "Don't crunch words with long lower case strings");
962 "Don't crunch words with long lower case strings");
972 "Max allowed deviation of blob top outside of font data");
976 "How many times worse " 977 "certainty does a superscript position glyph need to be for us " 978 "to try classifying it as a char with a different baseline?");
981 "badness do we think sufficient to choose a superscript over " 982 "what we'd thought. For example, a value of 0.6 means we want " 983 "to reduce badness of certainty by 40%");
985 "A superscript scaled down more than this is unbelievably " 986 "small. For example, 0.3 means we expect the font size to " 987 "be no smaller than 30% of the text line font size.");
989 "Maximum top of a character measured as a multiple of x-height " 990 "above the baseline for us to reconsider whether it's a " 993 "Minimum bottom of a character measured as a multiple of " 994 "x-height above the baseline for us to reconsider whether it's " 997 "Write block separators in output");
1004 "Write .box file for LSTM training");
1007 "Write WordStr format .box output file");
1010 "Create PDF with only one invisible text layer");
1014 "Specify minimum characters to try during OSD");
1024 "Make output have exactly one word per WERD");
1026 "Don't reject ANYTHING AT ALL");
1047 "-1 -> All pages, else specific page to process");
1053 "List of languages to load with this one");
1055 "In multilingual mode use params model of the primary language");
1059 "Min acceptable orientation margin");
1063 "Allow feature extractors to see the original outline");
1065 "Only initialize with the config file. Useful if the instance is " 1066 "not going to be used for OCR but say only for layout analysis.");
1070 "Force using vertical text page mode");
1072 "Fraction of textlines deemed vertical to use vertical page " 1075 "Fraction of height used as a minimum gap for aligned blobs.");
1078 "Preserve multiple interword spaces");
1080 "Page separator (default is form feed control character)");
1082 "Allows to include alternative symbols choices in the hOCR " 1084 "Valid input values are 0, 1, 2 and 3. 0 is the default value. " 1085 "With 1 the alternative symbol choices per timestep are included. " 1086 "With 2 the alternative symbol choices are accumulated per " 1089 "Detect music staff and remove intersecting components");
1103 const char* backup_config_file_;
1114 Pix* pix_thresholds_;
1119 int source_resolution_;
1126 bool right_to_left_;
1138 int font_table_size_;
1144 int train_line_page_num_;
1149 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_ bool crunch_include_numerals
char * tessedit_char_blacklist
bool tessedit_reject_bad_qual_wds
Pix * scaled_color() const
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
bool tessedit_zero_rejection
char * conflict_set_I_l_1
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
#define STRING_VAR_H(name, val, comment)
double noise_cert_basechar
bool rej_alphas_in_number_perm
void ResetDocumentDictionary()
Textord * mutable_textord()
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
bool rej_use_tess_accepted
char * applybox_exposure_pattern
bool tessedit_use_reject_spaces
bool tessedit_write_block_separators
bool tessedit_debug_doc_rejection
void set_source_resolution(int ppi)
bool noise_outlines(TWERD *word)
int pageseg_devanagari_split_strategy
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
void SetupUniversalFontIds()
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
void tilde_crunch(PAGE_RES_IT &page_res_it)
void script_pos_pass(PAGE_RES *page_res)
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
bool tessedit_row_rej_good_docs
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
int tessedit_tess_adaption_mode
bool tess_acceptable_word(WERD_RES *word)
bool tessedit_unrej_any_wd
bool word_dumper(PAGE_RES_IT *pr_it)
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
void convert_bad_unlv_chs(WERD_RES *word_res)
char * chs_trailing_punct2
double rej_whole_of_mostly_reject_word_fract
bool tessedit_create_boxfile
double textord_tabfind_aligned_gap_fraction
bool tessedit_display_outwords
void dictionary_correction_pass(PAGE_RES *page_res)
void tess_segment_pass_n(int pass_n, WERD_RES *word)
char * chs_trailing_punct1
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
void flip_hyphens(WERD_RES *word)
bool word_bln_display(PAGE_RES_IT *pr_it)
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
bool tessedit_write_rep_codes
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Tesseract * get_sub_lang(int index) const
bool word_set_display(PAGE_RES_IT *pr_it)
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
double crunch_poor_garbage_cert
int16_t word_blob_quality(WERD_RES *word, ROW *row)
bool write_results_empty_block
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
bool tessedit_create_lstmbox
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
void tilde_delete(PAGE_RES_IT &page_res_it)
void reject_I_1_L(WERD_RES *word)
bool tessedit_enable_doc_dict
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
int crunch_leave_lc_strings
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
int16_t word_outline_errs(WERD_RES *word)
UNICHAR_ID get_rep_char(WERD_RES *word)
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
bool check_debug_pt(WERD_RES *word, int location)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool last_char_was_newline
bool crunch_leave_ok_strings
bool unlv_tilde_crunching
void flip_0O(WERD_RES *word)
bool tessedit_dont_rowrej_good_wds
bool textord_tabfind_vertical_text
void SetupWordScripts(BLOCK_LIST *blocks)
void reject_mostly_rejects(WERD_RES *word)
double crunch_poor_garbage_rate
#define double_VAR_H(name, val, comment)
void TidyUp(PAGE_RES *page_res)
bool tessedit_ambigs_training
int16_t safe_dict_word(const WERD_RES *werd_res)
bool acceptable_number_string(const char *s, const char *lengths)
double tessedit_whole_wd_rej_row_percent
void ReSegmentByClassification(PAGE_RES *page_res)
int crunch_long_repetitions
void read_config_file(const char *filename, SetParamConstraint constraint)
bool paragraph_text_based
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
bool tessedit_write_images
double crunch_terrible_rating
void CorrectClassifyWords(PAGE_RES *page_res)
bool crunch_early_merge_tess_fails
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
char * tessedit_write_params_to_file
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
int paragraph_debug_level
void SetupWordPassN(int pass_n, WordData *word)
bool tessedit_enable_dict_correction
bool tessedit_use_primary_params_model
int16_t count_alphanums(const WERD_CHOICE &word)
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
bool tessedit_fix_fuzzy_spaces
void PrerecAllWordsPar(const GenericVector< WordData > &words)
void PreenXHeights(BLOCK_LIST *block_list)
int16_t count_outline_errs(char c, int16_t outline_count)
int tessedit_bigram_debug
int fixsp_non_noise_limit
#define BOOL_VAR_H(name, val, comment)
double min_orientation_margin
int num_sub_langs() const
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
void recog_word_recursive(WERD_RES *word)
double tessedit_upper_flip_hyphen
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
int x_ht_acceptance_tolerance
bool tessedit_debug_block_rejection
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
double crunch_pot_poor_rate
bool tessedit_create_alto
bool tessedit_create_wordstrbox
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
bool tessedit_train_line_recognizer
WordData(const PAGE_RES_IT &page_res_it)
PointerVector< WERD_RES > lang_words
int tessedit_ocr_engine_mode
double subscript_max_y_top
double crunch_del_high_word
int16_t alpha_count(const char *word, const char *word_lengths)
int16_t failure_count(WERD_RES *word)
bool crunch_leave_accept_strings
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
char * tessedit_char_whitelist
bool tessedit_debug_fonts
void font_recognition_pass(PAGE_RES *page_res)
bool tessedit_word_for_word
void fix_rep_char(PAGE_RES_IT *page_res_it)
bool textord_tabfind_show_vlines
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
bool tessedit_timing_debug
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
double crunch_small_outlines_size
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
bool tessedit_resegment_from_boxes
bool tessedit_enable_bigram_correction
void recognize_page(STRING &image_name)
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
void bigram_correction_pass(PAGE_RES *page_res)
int32_t adaption_word_number
void dont_allow_1Il(WERD_RES *word)
void SetScaledColor(int factor, Pix *color)
double tessedit_reject_block_percent
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
bool tessedit_adaption_debug
double suspect_accept_rating
void set_pix_original(Pix *original_pix)
bool crunch_early_convert_bad_unlv_chs
Dict & getDict() override
bool tessedit_good_quality_unrej
bool tilde_crunch_written
bool tessedit_override_permuter
bool crunch_terrible_garbage
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
int tessedit_preserve_min_wd_len
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool word_display(PAGE_RES_IT *pr_it)
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
void tess_add_doc_word(WERD_CHOICE *word_choice)
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
bool tessedit_dump_pageseg_images
float blob_noise_score(TBLOB *blob)
bool applybox_learn_ngrams_mode
void SetBlackAndWhitelist()
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Pix ** mutable_pix_binary()
bool applybox_learn_chars_and_char_frags_mode
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
int quality_min_initial_alphas_reqd
int crunch_leave_uc_strings
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
double tessedit_lower_flip_hyphen
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
double quality_outline_pc
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
bool preserve_interword_spaces
bool tessedit_prefer_joined_punct
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
void nn_recover_rejects(WERD_RES *word, ROW *row)
bool pageseg_apply_music_mask
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
int16_t count_alphas(const WERD_CHOICE &word)
void recog_word(WERD_RES *word)
int tessedit_image_border
void blamer_pass(PAGE_RES *page_res)
void nn_match_word(WERD_RES *word, ROW *row)
bool recog_interactive(PAGE_RES_IT *pr_it)
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
bool tessedit_minimal_rejection
bool tessedit_zero_kelvin_rejection
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
bool tessedit_test_adaption
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
double tessedit_reject_doc_percent
int multilang_debug_level
const Textord & textord() const
const FCOORD & reskew() const
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row)
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
bool rej_1Il_use_dict_word
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
int debug_fix_space_level
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
double superscript_scaledown_ratio
void ResetAdaptiveClassifier()
bool tessedit_rejection_debug
void break_noisiest_blob_word(WERD_RES_LIST &words)
bool poly_allow_detailed_fx
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
bool rej_1Il_trust_permuter_type
bool TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
void unrej_good_chs(WERD_RES *word, ROW *row)
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
int crunch_pot_indicators
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool textord_use_cjk_fp_model
bool tessedit_resegment_from_line_boxes
bool tessedit_make_boxes_from_boxes
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
void split_and_recog_word(WERD_RES *word)
double crunch_del_low_word
double crunch_pot_poor_cert
Assume a single uniform block of text. (Default.)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
bool textord_equation_detect
char * tessedit_char_unblacklist
double noise_cert_disjoint
void set_unlv_suspects(WERD_RES *word)
char * numeric_punctuation
double superscript_bettered_certainty
double tessedit_good_doc_still_rowrej_wd
bool tessedit_preserve_row_rej_perfect_wds
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
bool suspect_constrain_1Il
int16_t first_alphanum_index(const char *word, const char *word_lengths)
double superscript_min_y_bottom
int scaled_factor() const
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
bool tessedit_create_hocr
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
bool textord_tabfind_force_vertical_text
double fixsp_small_outlines_size
void set_done(WERD_RES *word, int16_t pass)
int min_characters_to_try
bool tessedit_dump_choices
double suspect_rating_per_ch
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
void set_pix_grey(Pix *grey_pix)
void set_word_fonts(WERD_RES *word)
char * tessedit_load_sublangs
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
double superscript_worse_certainty
FILE * init_recog_training(const STRING &fname)
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
int16_t doc_good_char_quality
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
double textord_tabfind_vertical_text_ratio
bool enable_noise_removal
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
bool tessedit_fix_hyphens
bool word_adaptable(WERD_RES *word, uint16_t mode)
bool tessedit_minimal_rej_pass1
bool tessedit_train_from_boxes
void process_image_event(const SVEvent &event)
int source_resolution() const
double tessedit_reject_row_percent
void pgeditor_main(int width, int height, PAGE_RES *page_res)
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
SVMenuNode * build_menu_new()
#define INT_VAR_H(name, val, comment)
double crunch_del_min_width
bool fixspace_thinks_word_done(WERD_RES *word)
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
bool process_cmd_win_event(int32_t cmd_event, char *new_value)
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
bool right_to_left() const
bool tessedit_preserve_blk_rej_perfect_wds
bool tessedit_init_config_only
void SearchWords(PointerVector< WERD_RES > *words)
int CountMisfitTops(WERD_RES *word_res)
void reject_edge_blobs(WERD_RES *word)
void set_pix_thresholds(Pix *thresholds)
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
bool interactive_display_mode
Pix * pix_original() const
char * ok_repeated_ch_non_alphanum_wds
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
int tessedit_pageseg_mode
bool tessedit_dont_blkrej_good_wds
int ocr_devanagari_split_strategy
bool tessedit_debug_quality_metrics
void SetEquationDetect(EquationDetect *detector)