// Unicode bidirectional formatting characters, spelled as universal character
// names. RtlEmbed() in this file wraps a word as kRLE + word + kPDF.
//
// NOTE(review): despite its name, kRLE holds U+202A, which Unicode defines as
// LEFT-TO-RIGHT EMBEDDING; RIGHT-TO-LEFT EMBEDDING is U+202B. The value is
// kept unchanged here -- confirm which embedding the debug output wants.
static const char *const kRLE = "\u202A";
// U+202C POP DIRECTIONAL FORMATTING: ends the embedding started by kRLE.
static const char *const kPDF = "\u202C";
57 reinterpret_cast<ParagraphModel *
>(
static_cast<uintptr_t
>(0xDEAD111F));
59 reinterpret_cast<ParagraphModel *
>(
static_cast<uintptr_t
>(0xDEAD888F));
// Returns four fifths of space_pix, truncated by integer division. Callers in
// this file pass a row's average inter-word space and use the result as the
// tolerance for NearlyEqual() and for paragraph models.
static int Epsilon(int space_pix) {
  return space_pix * 4 / 5;
}
73static bool AcceptableRowArgs(
int debug_level,
int min_num_rows,
const char *function_name,
74 const std::vector<RowScratchRegisters> *rows,
int row_start,
76 if (row_start < 0 ||
static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
77 tprintf(
"Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
81 if (row_end - row_start < min_num_rows) {
82 if (debug_level > 1) {
83 tprintf(
"# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name);
94static void PrintTable(
const std::vector<std::vector<std::string>> &rows,
const char *colsep) {
95 std::vector<int> max_col_widths;
96 for (
const auto &row : rows) {
97 auto num_columns = row.size();
98 for (
size_t c = 0; c < num_columns; c++) {
100 for (
char i : row[c]) {
101 if ((i & 0xC0) != 0x80) {
105 if (c >= max_col_widths.size()) {
106 max_col_widths.push_back(num_unicodes);
108 if (num_unicodes > max_col_widths[c]) {
109 max_col_widths[c] = num_unicodes;
115 std::vector<std::string> col_width_patterns;
116 col_width_patterns.reserve(max_col_widths.size());
117 for (
int max_col_width : max_col_widths) {
118 col_width_patterns.push_back(std::string(
"%-") + std::to_string(max_col_width) +
"s");
121 for (
const auto &row : rows) {
122 for (
unsigned c = 0; c < row.size(); c++) {
126 tprintf(col_width_patterns[c].c_str(), row[c].c_str());
132static std::string RtlEmbed(
const std::string &word,
bool rtlify) {
134 return std::string(kRLE) + word + std::string(kPDF);
140static void PrintDetectorState(
const ParagraphTheory &theory,
141 const std::vector<RowScratchRegisters> &rows) {
142 std::vector<std::vector<std::string>> output;
143 output.emplace_back();
144 output.back().push_back(
"#row");
145 output.back().push_back(
"space");
146 output.back().push_back(
"..");
147 output.back().push_back(
"lword[widthSEL]");
148 output.back().push_back(
"rword[widthSEL]");
150 output.back().push_back(
"text");
152 for (
unsigned i = 0; i < rows.size(); i++) {
153 output.emplace_back();
154 std::vector<std::string> &row = output.back();
155 const RowInfo &ri = *rows[i].ri_;
156 row.push_back(std::to_string(i));
157 row.push_back(std::to_string(ri.average_interword_space));
158 row.emplace_back(ri.has_leaders ?
".." :
" ");
159 row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
"[" + std::to_string(ri.lword_box.width()) +
160 (ri.lword_likely_starts_idea ?
"S" :
"s") +
161 (ri.lword_likely_ends_idea ?
"E" :
"e") +
162 (ri.lword_indicates_list_item ?
"L" :
"l") +
"]");
163 row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
"[" + std::to_string(ri.rword_box.width()) +
164 (ri.rword_likely_starts_idea ?
"S" :
"s") +
165 (ri.rword_likely_ends_idea ?
"E" :
"e") +
166 (ri.rword_indicates_list_item ?
"L" :
"l") +
"]");
167 rows[i].AppendDebugInfo(theory, row);
168 row.push_back(RtlEmbed(ri.text, !ri.ltr));
170 PrintTable(output,
" ");
172 tprintf(
"Active Paragraph Models:\n");
174 for (
const auto &model : theory.models()) {
175 tprintf(
" %d: %s\n", ++m, model->ToString().c_str());
179static void DebugDump(
bool should_print,
const char *phase,
const ParagraphTheory &theory,
180 const std::vector<RowScratchRegisters> &rows) {
185 PrintDetectorState(theory, rows);
189static void PrintRowRange(
const std::vector<RowScratchRegisters> &rows,
int row_start,
191 tprintf(
"======================================\n");
192 for (
int row = row_start; row < row_end; row++) {
193 tprintf(
"%s\n", rows[row].ri_->text.c_str());
195 tprintf(
"======================================\n");
// Returns true if ch is an unaccented ASCII letter: 'a'..'z' or 'A'..'Z'.
static bool IsLatinLetter(int ch) {
  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}
// Returns true if ch is one of the letters 'o', 'O', 'l' or 'I'.
// NOTE(review): going by the name, these are presumably the letters that can
// be confused with the digits 0 and 1 -- confirm against the callers.
static bool IsDigitLike(int ch) {
  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
}
// Returns true if ch is one of the opening punctuation marks ' " ( { [.
static bool IsOpeningPunct(int ch) {
  // strchr() converts its argument to char and treats the terminating NUL as
  // part of the searched string, so 0 and values that alias an ASCII mark
  // after narrowing (e.g. 0x128 -> '(') would otherwise match. Every mark in
  // the set is ASCII, so anything outside 1..0x7F can be rejected up front.
  if (ch <= 0 || ch > 0x7F) {
    return false;
  }
  return strchr("'\"({[", ch) != nullptr;
}
// Returns true if ch is one of the closing/terminal punctuation marks
// : ' " . ? ! ] } ).
static bool IsTerminalPunct(int ch) {
  // strchr() converts its argument to char and treats the terminating NUL as
  // part of the searched string, so 0 and values that alias an ASCII mark
  // after narrowing (e.g. 0x12E -> '.') would otherwise match. Every mark in
  // the set is ASCII, so anything outside 1..0x7F can be rejected up front.
  if (ch <= 0 || ch > 0x7F) {
    return false;
  }
  return strchr(":'\".?!]})", ch) != nullptr;
}
217static const char *SkipChars(
const char *str,
const char *toskip) {
218 while (*str !=
'\0' && strchr(toskip, *str)) {
224static const char *SkipChars(
const char *str,
bool (*skip)(
int)) {
225 while (*str !=
'\0' && skip(*str)) {
231static const char *SkipOne(
const char *str,
const char *toskip) {
232 if (*str !=
'\0' && strchr(toskip, *str)) {
// Scans word for at most three numeral segments (the loop stops once
// num_segments reaches 3 or the end of the string is hit). For each segment
// it applies SkipOne() twice with the opening-bracket set, then tries a run
// of Roman-numeral letters, then a run of digits, then a run of Latin
// letters, and finally steps over closing brackets followed by separators.
// The statements inside several branches and the final return are not
// visible in this extract.
241static bool LikelyListNumeral(
const std::string &word) {
242 const char *kRomans =
"ivxlmdIVXLMD";
// NOTE(review): this digit set has no '6' ("012345789"), so a numeral such as
// "6." or "16)" cannot be consumed as a digit run. Almost certainly a typo for
// "0123456789" -- confirm and fix.
243 const char *kDigits =
"012345789";
244 const char *kOpen =
"[{(";
245 const char *kSep =
":;-.,";
246 const char *kClose =
"]})";
248 int num_segments = 0;
249 const char *pos = word.c_str();
250 while (*pos !=
'\0' && num_segments < 3) {
252 const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
253 const char *numeral_end = SkipChars(numeral_start, kRomans);
254 if (numeral_end != numeral_start) {
257 numeral_end = SkipChars(numeral_start, kDigits);
258 if (numeral_end == numeral_start) {
260 numeral_end = SkipChars(numeral_start, IsLatinLetter);
// A Latin-letter run only counts when it is exactly one character long.
261 if (numeral_end - numeral_start != 1) {
269 pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
// No closing bracket or separator was consumed after the numeral.
270 if (pos == numeral_end) {
// Returns true if word is exactly one character long and that character is
// one of the bullet-like marks 0 O o * . , +.
static bool LikelyListMark(const std::string &word) {
  // The original set listed '.' twice; the duplicate had no effect.
  const char *kListMarks = "0Oo*.,+";
  // strchr() also matches the terminating NUL of kListMarks, so a
  // one-character string holding '\0' must be rejected explicitly.
  return word.size() == 1 && word[0] != '\0' && strchr(kListMarks, word[0]) != nullptr;
}
283 return LikelyListMark(word) || LikelyListNumeral(word);
// Looks up the unichar id at index pos of werd, maps it to its string form
// through unicharset u, wraps that in a UNICHAR and returns first_uni() of
// it. The body of the guard below (what is returned when u or werd is null
// or pos fails the bounds test) is not visible in this extract.
289static int UnicodeFor(
const UNICHARSET *u,
const WERD_CHOICE *werd,
unsigned pos) {
// NOTE(review): `pos > werd->length()` lets pos == length() fall through to
// unichar_id(pos) below. If length() is the element count, that is one past
// the last valid index and the test should probably be `>=` -- confirm
// against WERD_CHOICE.
290 if (!u || !werd || pos > werd->length()) {
293 return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
301 : u_(unicharset), word_(word), wordlen_(word->length()) {
320 while (pos < wordlen_ && u_->get_ispunctuation(word_->
unichar_id(pos))) {
327 while (pos < wordlen_ &&
335 const char *kRomans =
"ivxlmdIVXLMD";
336 while (pos < wordlen_) {
337 int ch = UnicodeFor(u_, word_, pos);
338 if (ch >= 0xF0 || strchr(kRomans, ch) ==
nullptr) {
347 while (pos < wordlen_ && u_->get_isalpha(word_->
unichar_id(pos))) {
353static bool LikelyListMarkUnicode(
int ch) {
355 std::string single_ch;
357 return LikelyListMark(single_ch);
382static bool UniLikelyListItem(
const UNICHARSET *u,
const WERD_CHOICE *werd) {
383 if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
387 UnicodeSpanSkipper m(u, werd);
388 int num_segments = 0;
390 while (pos < werd->length() && num_segments < 3) {
391 auto numeral_start = m.SkipPunc(pos);
392 if (numeral_start > pos + 1) {
395 auto numeral_end = m.SkipRomans(numeral_start);
396 if (numeral_end == numeral_start) {
397 numeral_end = m.SkipDigits(numeral_start);
398 if (numeral_end == numeral_start) {
400 numeral_end = m.SkipAlpha(numeral_start);
401 if (numeral_end - numeral_start != 1) {
409 pos = m.SkipPunc(numeral_end);
410 if (pos == numeral_end) {
414 return pos == werd->length();
419 if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
420 vector.push_back(data);
432 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
434 *starts_idea =
false;
436 if (utf8.empty() || (werd !=
nullptr && werd->
empty())) {
441 if (unicharset && werd) {
442 if (UniLikelyListItem(unicharset, werd)) {
459 int start_letter = utf8[0];
460 if (IsOpeningPunct(start_letter)) {
463 if (IsTerminalPunct(start_letter)) {
466 if (start_letter >=
'A' && start_letter <=
'Z') {
478 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
480 *starts_idea =
false;
482 if (utf8.empty() || (werd !=
nullptr && werd->
empty())) {
487 if (unicharset && werd) {
488 if (UniLikelyListItem(unicharset, werd)) {
501 int last_letter = utf8[utf8.size() - 1];
502 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
511 header.emplace_back(
"[lmarg,lind;rind,rmarg]");
512 header.emplace_back(
"model");
516 std::vector<std::string> &dbg)
const {
520 std::string model_string;
524 int model_numbers = 0;
525 for (
const auto &hypothese : hypotheses_) {
526 if (hypothese.model ==
nullptr) {
529 if (model_numbers > 0) {
533 model_string += std::to_string(1 + theory.
IndexOf(hypothese.model));
535 model_string +=
"CrL";
537 model_string +=
"CrR";
541 if (model_numbers == 0) {
545 dbg.push_back(model_string);
557 if (hypotheses_.empty()) {
560 bool has_start =
false;
561 bool has_body =
false;
562 for (
const auto &hypothese : hypotheses_) {
563 switch (hypothese.ty) {
571 tprintf(
"Encountered bad value in hypothesis list: %c\n", hypothese.ty);
575 if (has_start && has_body) {
582 if (hypotheses_.empty()) {
585 bool has_start =
false;
586 bool has_body =
false;
587 for (
const auto &hypothese : hypotheses_) {
588 if (hypothese.model != model) {
591 switch (hypothese.ty) {
599 tprintf(
"Encountered bad value in hypothesis list: %c\n", hypothese.ty);
603 if (has_start && has_body) {
612 tprintf(
"Trying to set a line to be START when it's already BODY.\n");
622 tprintf(
"Trying to set a line to be BODY when it's already START.\n");
632 if (found != hypotheses_.end()) {
633 hypotheses_.erase(found);
640 if (found != hypotheses_.end()) {
641 hypotheses_.erase(found);
646 for (
const auto &hypothese : hypotheses_) {
654 for (
const auto &hypothese : hypotheses_) {
662 for (
const auto &hypothese : hypotheses_) {
663 if (hypothese.model !=
nullptr) {
670 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_START) {
673 return hypotheses_[0].model;
677 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_BODY) {
680 return hypotheses_[0].model;
685 if (models.empty()) {
688 for (
int h = hypotheses_.size() - 1; h >= 0; h--) {
689 if (!
contains(models, hypotheses_[h].model)) {
690 hypotheses_.erase(hypotheses_.begin() + h);
707 explicit SimpleClusterer(
int max_cluster_width) : max_cluster_width_(max_cluster_width) {}
709 values_.push_back(value);
712 return values_.size();
717 int max_cluster_width_;
718 std::vector<int> values_;
722static int ClosestCluster(
const std::vector<Cluster> &clusters,
int value) {
723 unsigned best_index = 0;
724 for (
unsigned i = 0; i < clusters.size(); i++) {
725 if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) {
734 std::sort(values_.begin(), values_.end());
735 for (
unsigned i = 0; i < values_.size();) {
739 while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
742 clusters->push_back(
Cluster((hi + lo) / 2, i - orig_i));
748static void CalculateTabStops(std::vector<RowScratchRegisters> *rows,
int row_start,
int row_end,
749 int tolerance, std::vector<Cluster> *left_tabs,
750 std::vector<Cluster> *right_tabs) {
751 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
755 SimpleClusterer initial_lefts(tolerance);
756 SimpleClusterer initial_rights(tolerance);
757 std::vector<Cluster> initial_left_tabs;
758 std::vector<Cluster> initial_right_tabs;
759 for (
int i = row_start; i < row_end; i++) {
760 initial_lefts.Add((*rows)[i].lindent_);
761 initial_rights.Add((*rows)[i].rindent_);
763 initial_lefts.GetClusters(&initial_left_tabs);
764 initial_rights.GetClusters(&initial_right_tabs);
772 SimpleClusterer lefts(tolerance);
773 SimpleClusterer rights(tolerance);
779 int infrequent_enough_to_ignore = 0;
780 if (row_end - row_start >= 8) {
781 infrequent_enough_to_ignore = 1;
783 if (row_end - row_start >= 20) {
784 infrequent_enough_to_ignore = 2;
787 for (
int i = row_start; i < row_end; i++) {
788 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
789 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
790 if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
791 initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
792 lefts.Add((*rows)[i].lindent_);
793 rights.Add((*rows)[i].rindent_);
796 lefts.GetClusters(left_tabs);
797 rights.GetClusters(right_tabs);
799 if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
800 (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
805 for (
int i = row_start; i < row_end; i++) {
806 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
807 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
808 if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
809 initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
810 lefts.Add((*rows)[i].lindent_);
811 rights.Add((*rows)[i].rindent_);
815 lefts.GetClusters(left_tabs);
816 rights.GetClusters(right_tabs);
820 if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
822 for (
int i = left_tabs->size() - 1; i >= 0; i--) {
823 if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
827 if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
828 left_tabs->erase(left_tabs->begin() + to_prune);
831 if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
833 for (
int i = right_tabs->size() - 1; i >= 0; i--) {
834 if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
838 if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
839 right_tabs->erase(right_tabs->begin() + to_prune);
863static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows,
int row_start,
int row_end,
864 const ParagraphModel *model,
bool ltr,
int eop_threshold) {
865 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
868 for (
int row = row_start; row < row_end; row++) {
871 if (valid_first && !valid_body) {
872 (*rows)[row].AddStartLine(model);
873 }
else if (valid_body && !valid_first) {
874 (*rows)[row].AddBodyLine(model);
875 }
else if (valid_body && valid_first) {
876 bool after_eop = (row == row_start);
877 if (row > row_start) {
878 if (eop_threshold > 0) {
880 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
882 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
889 (*rows)[row].AddStartLine(model);
891 (*rows)[row].AddBodyLine(model);
915 "Geometry: TabStop cluster tolerance = %d; "
916 "%zu left tabs; %zu right tabs\n",
919 ltr = (*r)[r_start].ri_->ltr;
955 return ClosestCluster(
left_tabs, (*
rows)[i].lindent_) == 0 &&
966 return ::tesseract::FirstWordWouldHaveFit((*
rows)[row_a], (*
rows)[row_b],
just);
973 void Fail(
int min_debug_level,
const char *why)
const {
990 std::vector<RowScratchRegisters> *
rows;
1046 int num_full_rows = 0;
1047 int last_row_full = 0;
1057 if (num_full_rows < 0.7 * num_rows) {
1058 s.
Fail(1,
"Not enough full lines to know which lines start paras.");
1071 if (debug_level > 0) {
1073 "# Not enough variety for clear outline classification. "
1074 "Guessing these are %s aligned based on script.\n",
1075 s.
ltr ?
"left" :
"right");
1083 if (num_rows - 1 == num_full_rows - last_row_full) {
1088 (*s.
rows)[i].AddBodyLine(model);
1134static void GeometricClassify(
int debug_level, std::vector<RowScratchRegisters> *rows,
1135 int row_start,
int row_end, ParagraphTheory *theory) {
1136 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {
1139 if (debug_level > 1) {
1140 tprintf(
"###############################################\n");
1141 tprintf(
"##### GeometricClassify( rows[%d:%d) ) ####\n", row_start, row_end);
1142 tprintf(
"###############################################\n");
1146 GeometricClassifierState s(debug_level, rows, row_start, row_end);
1148 s.
Fail(2,
"Too much variety for simple outline classification.");
1152 s.
Fail(1,
"Not enough variety for simple outline classification.");
1156 GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1181 int firsts[2] = {0, 0};
1186 bool jam_packed =
true;
1201 int percent0firsts, percent1firsts;
1202 percent0firsts = (100 * firsts[0]) / s.
AlignTabs()[0].count;
1203 percent1firsts = (100 * firsts[1]) / s.
AlignTabs()[1].count;
1206 if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {
1209 }
else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1210 percent1firsts + 30 < percent0firsts) {
1215 if (debug_level > 1) {
1216 tprintf(
"# Cannot determine %s indent likely to start paragraphs.\n",
1217 s.
just == tesseract::JUSTIFICATION_LEFT ?
"left" :
"right");
1218 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1219 s.
AlignTabs()[0].center, percent0firsts);
1220 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1221 s.
AlignTabs()[1].center, percent1firsts);
1232 const ParagraphModel *model = theory->AddModel(s.
Model());
1262 MarkRowsWithModel(rows, row_start, row_end, model, s.
ltr, s.
eop_threshold);
1268 for (
const auto &m : *models_) {
1269 if (m->Comparable(model)) {
1274 models_->push_back(m);
1281 for (
size_t r = 0; r < models_->size(); r++) {
1299 int start,
int end)
const {
1300 for (
const auto *model : *models_) {
1309 for (
const auto *model : *models_) {
1318 for (
const auto *m : *models_) {
1330 tprintf(
"ValidFirstLine() should only be called with strong models!\n");
1333 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1339 tprintf(
"ValidBodyLine() should only be called with strong models!\n");
1342 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1348 tprintf(
"CrownCompatible() should only be called with crown models!\n");
1351 auto &row_a = (*rows)[a];
1352 auto &row_b = (*rows)[b];
1354 return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_,
1355 Epsilon(row_a.ri_->average_interword_space));
1357 return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_,
1358 Epsilon(row_a.ri_->average_interword_space));
1365 : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
1366 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1371 open_models_.resize(open_models_.size() + row_end - row_start + 2);
1375void ParagraphModelSmearer::CalculateOpenModels(
int row_start,
int row_end) {
1377 if (row_start < row_start_) {
1378 row_start = row_start_;
1380 if (row_end > row_end_) {
1384 for (
int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {
1385 if ((*rows_)[row].ri_->num_words == 0) {
1386 OpenModels(row + 1) = no_models;
1389 (*rows_)[row].StartHypotheses(&opened);
1393 for (
auto &m : opened) {
1401 OpenModels(row + 1) = still_open;
1408 CalculateOpenModels(row_start_, row_end_);
1413 for (
int i = row_start_; i < row_end_; i++) {
1423 bool left_align_open =
false;
1424 bool right_align_open =
false;
1425 for (
auto &m : OpenModels(i)) {
1426 switch (m->justification()) {
1428 left_align_open =
true;
1431 right_align_open =
true;
1434 left_align_open = right_align_open =
true;
1442 likely_start =
true;
1444 if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
1447 }
else if (left_align_open) {
1460 for (
unsigned m = 0; m < OpenModels(i).size(); m++) {
1469 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1473 for (
auto model : last_line_models) {
1488 for (
auto &all_model : all_models) {
1498 CalculateOpenModels(i + 1, row_end_);
1507static void DiscardUnusedModels(
const std::vector<RowScratchRegisters> &rows,
1510 for (
const auto &row : rows) {
1511 row.StrongHypotheses(&used_models);
1540static void DowngradeWeakestToCrowns(
int debug_level, ParagraphTheory *theory,
1541 std::vector<RowScratchRegisters> *rows) {
1543 for (
int end = rows->size(); end > 0; end = start) {
1545 const ParagraphModel *model =
nullptr;
1546 while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) ==
nullptr) {
1553 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1556 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
StrongModel(model) &&
1557 NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
1562 if (
StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {
1573 const ParagraphModel *crown_model = model;
1575 if (model->justification() == JUSTIFICATION_LEFT) {
1581 (*rows)[start].SetUnknown();
1582 (*rows)[start].AddStartLine(crown_model);
1583 for (
int row = start + 1; row < end; row++) {
1584 (*rows)[row].SetUnknown();
1585 (*rows)[row].AddBodyLine(crown_model);
1589 DiscardUnusedModels(*rows, theory);
1609 int end,
int percentile) {
1610 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) {
1614 int lmin, lmax, rmin, rmax;
1615 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1616 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1617 for (
int i = start; i < end; i++) {
1626 STATS lefts(lmin, lmax);
1627 STATS rights(rmin, rmax);
1628 for (
int i = start; i < end; i++) {
1636 int ignorable_left = lefts.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1637 int ignorable_right = rights.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1638 for (
int i = start; i < end; i++) {
1640 int ldelta = ignorable_left - sr.
lmargin_;
1643 int rdelta = ignorable_right - sr.
rmargin_;
1650int InterwordSpace(
const std::vector<RowScratchRegisters> &rows,
int row_start,
int row_end) {
1651 if (row_end < row_start + 1) {
1655 (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2;
1657 (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2;
1658 STATS spacing_widths(0, 4 + word_width);
1659 for (
int i = row_start; i < row_end; i++) {
1660 if (rows[i].ri_->num_words > 1) {
1661 spacing_widths.
add(rows[i].ri_->average_interword_space, 1);
1664 int minimum_reasonable_space = word_height / 3;
1665 if (minimum_reasonable_space < 2) {
1666 minimum_reasonable_space = 2;
1668 int median = spacing_widths.
median();
1669 return (median > minimum_reasonable_space) ? median : minimum_reasonable_space;
1681 tprintf(
"Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1683 int available_space;
1705 int available_space = before.
lindent_;
1706 if (before.
rindent_ > available_space) {
1717static bool TextSupportsBreak(
const RowScratchRegisters &before,
const RowScratchRegisters &after) {
1718 if (before.ri_->ltr) {
1719 return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea;
1721 return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea;
1725static bool LikelyParagraphStart(
const RowScratchRegisters &before,
1726 const RowScratchRegisters &after,
1728 return before.ri_->num_words == 0 ||
1737static ParagraphModel InternalParagraphModelByOutline(
1738 const std::vector<RowScratchRegisters> *rows,
int start,
int end,
int tolerance,
1740 int ltr_line_count = 0;
1741 for (
int i = start; i < end; i++) {
1742 ltr_line_count +=
static_cast<int>((*rows)[i].ri_->ltr);
1744 bool ltr = (ltr_line_count >= (end - start) / 2);
1747 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) {
1748 return ParagraphModel();
1753 int lmargin = (*rows)[start].lmargin_;
1754 int rmargin = (*rows)[start].rmargin_;
1755 int lmin, lmax, rmin, rmax, cmin, cmax;
1756 lmin = lmax = (*rows)[start + 1].lindent_;
1757 rmin = rmax = (*rows)[start + 1].rindent_;
1759 for (
int i = start + 1; i < end; i++) {
1760 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1761 tprintf(
"Margins don't match! Software error.\n");
1762 *consistent =
false;
1763 return ParagraphModel();
1767 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1769 int ldiff = lmax - lmin;
1770 int rdiff = rmax - rmin;
1771 int cdiff = cmax - cmin;
1772 if (rdiff > tolerance && ldiff > tolerance) {
1773 if (cdiff < tolerance * 2) {
1774 if (end - start < 3) {
1775 return ParagraphModel();
1777 return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1779 *consistent =
false;
1780 return ParagraphModel();
1782 if (end - start < 3) {
1783 return ParagraphModel();
1788 bool body_admits_left_alignment = ldiff < tolerance;
1789 bool body_admits_right_alignment = rdiff < tolerance;
1791 ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1792 (lmin + lmax) / 2, tolerance);
1793 ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1794 (rmin + rmax) / 2, tolerance);
1798 bool text_admits_left_alignment = ltr || left_model.is_flush();
1799 bool text_admits_right_alignment = !ltr || right_model.is_flush();
1804 if (tolerance < rdiff) {
1805 if (body_admits_left_alignment && text_admits_left_alignment) {
1808 *consistent =
false;
1809 return ParagraphModel();
1811 if (tolerance < ldiff) {
1812 if (body_admits_right_alignment && text_admits_right_alignment) {
1815 *consistent =
false;
1816 return ParagraphModel();
1823 int first_left = (*rows)[start].lindent_;
1824 int first_right = (*rows)[start].rindent_;
1826 if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) {
1829 if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) {
1833 *consistent =
false;
1834 return ParagraphModel();
1841static ParagraphModel ParagraphModelByOutline(
int debug_level,
1842 const std::vector<RowScratchRegisters> *rows,
1843 int start,
int end,
int tolerance) {
1844 bool unused_consistent;
1845 ParagraphModel retval =
1846 InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent);
1847 if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1848 tprintf(
"Could not determine a model for this paragraph:\n");
1849 PrintRowRange(*rows, start, end);
1855bool RowsFitModel(
const std::vector<RowScratchRegisters> *rows,
int start,
int end,
1857 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) {
1863 for (
int i = start + 1; i < end; i++) {
1882static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows,
int row_start,
1885 for (
int i = row_start + 1; i < row_end; i++) {
1886 const RowScratchRegisters &prev = (*rows)[i - 1];
1887 RowScratchRegisters &curr = (*rows)[i];
1890 if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
1910 RowScratchRegisters &curr = (*rows)[row_start];
1911 RowScratchRegisters &next = (*rows)[row_start + 1];
1914 (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
1915 curr.SetStartLine();
1919 for (
int i = row_start + 1; i < row_end - 1; i++) {
1920 RowScratchRegisters &prev = (*rows)[i - 1];
1921 RowScratchRegisters &curr = (*rows)[i];
1922 RowScratchRegisters &next = (*rows)[i + 1];
1925 LikelyParagraphStart(prev, curr, j)) {
1926 curr.SetStartLine();
1931 RowScratchRegisters &prev = (*rows)[row_end - 2];
1932 RowScratchRegisters &curr = (*rows)[row_end - 1];
1935 LikelyParagraphStart(prev, curr, j)) {
1936 curr.SetStartLine();
1944static void ModelStrongEvidence(
int debug_level, std::vector<RowScratchRegisters> *rows,
1945 int row_start,
int row_end,
bool allow_flush_models,
1946 ParagraphTheory *theory) {
1947 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
1951 int start = row_start;
1952 while (start < row_end) {
1953 while (start < row_end && (*rows)[start].GetLineType() !=
LT_START) {
1956 if (start >= row_end - 1) {
1960 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1962 ParagraphModel last_model;
1963 bool next_consistent;
1969 if (end < row_end - 1) {
1970 RowScratchRegisters &next = (*rows)[end];
1975 next_consistent =
false;
1977 if (next_consistent) {
1978 ParagraphModel next_model =
1979 InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
1980 if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&
1981 next_model.justification() != JUSTIFICATION_LEFT) ||
1982 (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&
1983 next_model.justification() != JUSTIFICATION_RIGHT)) {
1984 next_consistent =
false;
1986 last_model = next_model;
1988 next_consistent =
false;
1990 }
while (next_consistent && end < row_end);
1994 if (end > start + 1) {
1996 const ParagraphModel *model =
nullptr;
1997 ParagraphModel new_model = ParagraphModelByOutline(
1998 debug_level, rows, start, end, Epsilon(
InterwordSpace(*rows, start, end)));
1999 if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
2001 }
else if (new_model.is_flush()) {
2002 if (end == start + 2) {
2005 }
else if (start == row_start) {
2007 if (new_model.justification() == JUSTIFICATION_LEFT) {
2012 }
else if (allow_flush_models) {
2013 model = theory->AddModel(new_model);
2016 model = theory->AddModel(new_model);
2019 (*rows)[start].AddStartLine(model);
2020 for (
int i = start + 1; i < end; i++) {
2021 (*rows)[i].AddBodyLine(model);
2036static void StrongEvidenceClassify(
int debug_level, std::vector<RowScratchRegisters> *rows,
2037 int row_start,
int row_end, ParagraphTheory *theory) {
2038 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
2042 if (debug_level > 1) {
2043 tprintf(
"#############################################\n");
2044 tprintf(
"# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2045 tprintf(
"#############################################\n");
2049 MarkStrongEvidence(rows, row_start, row_end);
2051 DebugDump(debug_level > 2,
"Initial strong signals.", *theory, *rows);
2054 ModelStrongEvidence(debug_level, rows, row_start, row_end,
false, theory);
2056 DebugDump(debug_level > 2,
"Unsmeared hypotheses.s.", *theory, *rows);
2061 ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2065static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows,
int row_start,
2066 int row_end, ParagraphTheory *theory) {
2067 for (
int i = row_start + 1; i < row_end - 1; i++) {
2068 if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
2069 (*rows)[i + 1].ri_->has_leaders) {
2070 const ParagraphModel *model =
2071 theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2072 (*rows)[i].AddStartLine(model);
// Walks rows from the back: `end` starts at rows.size() and every pass of the
// outer loop sets end = start. For each run [start, end) it allocates a PARA,
// picks or creates a model, resets the run's rows with SetUnknown(), marks
// rows[start] with AddStartLine(model) and the rest with AddBodyLine(model),
// and stores the PARA pointer in (*row_owners)[row] for every row of the run.
// Many statements of this function are not visible in this extract.
2079static void ConvertHypothesizedModelRunsToParagraphs(
int debug_level,
2080 std::vector<RowScratchRegisters> &rows,
2081 std::vector<PARA *> *row_owners,
2082 ParagraphTheory *theory) {
2083 int end = rows.size();
2085 for (; end > 0; end = start) {
2087 const ParagraphModel *model =
nullptr;
2089 bool single_line_paragraph =
false;
2091 rows[start].NonNullHypotheses(&models);
2092 if (!models.empty()) {
2094 if (rows[start].GetLineType(model) !=
LT_BODY) {
2095 single_line_paragraph =
true;
2098 if (model && !single_line_paragraph) {
// Step backwards over rows that are body lines of this model.
2100 while (--start > 0 && rows[start].GetLineType(model) ==
LT_BODY) {
2103 if (start < 0 || rows[start].GetLineType(model) !=
LT_START) {
2107 if (model ==
nullptr) {
2111 PARA *p =
new PARA();
2113 p->is_very_first_or_continuation =
true;
// Look at the owners already assigned to the rows after this run for a model
// to borrow.
2117 for (
unsigned row = end; row < rows.size(); row++) {
2118 if ((*row_owners)[row] &&
2120 (start == 0 ||
ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2121 model = (*row_owners)[row]->model;
2127 model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT,
2128 rows[start].lmargin_ + rows[start].lindent_, 0, 0,
2129 Epsilon(rows[start].ri_->average_interword_space)));
2132 model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT,
// NOTE(review): the left-justified model above uses lmargin_ + lindent_, but
// this one adds rmargin_ to itself. It looks like a copy/paste slip for
// rmargin_ + rindent_ -- confirm and fix.
2133 rows[start].rmargin_ + rows[start].rmargin_, 0, 0,
2134 Epsilon(rows[start].ri_->average_interword_space)));
2137 rows[start].SetUnknown();
2138 rows[start].AddStartLine(model);
2139 for (
int i = start + 1; i < end; i++) {
2140 rows[i].SetUnknown();
2141 rows[i].AddBodyLine(model);
2144 p->has_drop_cap = rows[start].ri_->has_drop_cap;
2146 ? rows[start].ri_->rword_indicates_list_item
2147 : rows[start].ri_->lword_indicates_list_item;
2148 for (
int row = start; row < end; row++) {
// A row that already has an owner is reported and the old PARA is deleted
// before the new one is stored.
2149 if ((*row_owners)[row] !=
nullptr) {
2151 "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2152 "more than once!\n");
2153 delete (*row_owners)[row];
2155 (*row_owners)[row] = p;
// For each strong model hypothesised on rows[row], scans backwards from
// row - 1 and forwards from row + 1 while `continues` holds, switching on
// each neighbour's line type under that model, and then tests the resulting
// run_length. The switch bodies and the return statements are not visible in
// this extract.
2177static bool RowIsStranded(
const std::vector<RowScratchRegisters> &rows,
int row) {
2179 rows[row].StrongHypotheses(&row_models);
2181 for (
auto &row_model : row_models) {
// NOTE(review): GetLineType() is compared against LT_START / LT_BODY
// elsewhere in this file, so it returns a line-type value, not a flag.
// Storing it in a bool makes all_starts true for every nonzero line type.
// Probably meant `GetLineType() == LT_START` -- confirm.
2182 bool all_starts = rows[row].GetLineType();
2184 bool continues =
true;
2185 for (
int i = row - 1; i >= 0 && continues; i--) {
2187 rows[i].NonNullHypotheses(&models);
2188 switch (rows[i].GetLineType(row_model)) {
2203 for (
unsigned i = row + 1; i < rows.size() && continues; i++) {
2205 rows[i].NonNullHypotheses(&models);
2206 switch (rows[i].GetLineType(row_model)) {
2220 if (run_length > 2 || (!all_starts && run_length > 1)) {
2233static void LeftoverSegments(
const std::vector<RowScratchRegisters> &rows,
2234 std::vector<Interval> *to_fix,
int row_start,
int row_end) {
2236 for (
int i = row_start; i < row_end; i++) {
2237 bool needs_fixing =
false;
2241 rows[i].StrongHypotheses(&models);
2242 rows[i].NonNullHypotheses(&models_w_crowns);
2243 if (models.empty() && !models_w_crowns.empty()) {
2245 for (
unsigned end = i + 1; end < rows.size(); end++) {
2248 rows[end].NonNullHypotheses(&end_models);
2249 rows[end].StrongHypotheses(&strong_end_models);
2250 if (end_models.empty()) {
2251 needs_fixing =
true;
2253 }
else if (!strong_end_models.empty()) {
2254 needs_fixing =
false;
2258 }
else if (models.empty() && rows[i].ri_->num_words > 0) {
2260 needs_fixing =
true;
2263 if (!needs_fixing && !models.empty()) {
2264 needs_fixing = RowIsStranded(rows, i);
2268 if (!to_fix->empty() && to_fix->back().end == i - 1) {
2269 to_fix->back().end = i;
2271 to_fix->push_back(Interval(i, i));
2276 for (
auto &i : *to_fix) {
2285 std::vector<PARA *> &rows = *row_owners;
2286 paragraphs->clear();
2287 PARA_IT out(paragraphs);
2288 PARA *formerly_null =
nullptr;
2289 for (
unsigned i = 0; i < rows.size(); i++) {
2290 if (rows[i] ==
nullptr) {
2291 if (i == 0 || rows[i - 1] != formerly_null) {
2292 rows[i] = formerly_null =
new PARA();
2294 rows[i] = formerly_null;
2297 }
else if (i > 0 && rows[i - 1] == rows[i]) {
2300 out.add_after_then_move(rows[i]);
2315 std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
2316 std::vector<ParagraphModel *> *models) {
2320 row_owners->clear();
2321 row_owners->resize(row_infos->size());
2324 std::vector<RowScratchRegisters> rows(row_infos->size());
2325 for (
unsigned i = 0; i < row_infos->size(); i++) {
2326 rows[i].Init((*row_infos)[i]);
2334 SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2336 DebugDump(debug_level > 1,
"End of Pass 1", theory, rows);
2338 std::vector<Interval> leftovers;
2339 LeftoverSegments(rows, &leftovers, 0, rows.size());
2340 for (
auto &leftover : leftovers) {
2346 StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2352 std::vector<Interval> leftovers2;
2353 LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
2354 bool pass2a_was_useful =
2355 leftovers2.size() > 1 ||
2356 (leftovers2.size() == 1 && (leftovers2[0].begin != 0 ||
static_cast<size_t>(leftovers2[0].end) != rows.size()));
2357 if (pass2a_was_useful) {
2358 for (
auto &leftover2 : leftovers2) {
2359 StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
2364 DebugDump(debug_level > 1,
"End of Pass 2", theory, rows);
2370 LeftoverSegments(rows, &leftovers, 0, rows.size());
2371 for (
auto &leftover : leftovers) {
2372 GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2376 DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2378 DebugDump(debug_level > 1,
"End of Pass 3", theory, rows);
2382 LeftoverSegments(rows, &leftovers, 0, rows.size());
2383 for (
auto &leftover : leftovers) {
2384 for (
int j = leftover.begin; j < leftover.end; j++) {
2385 rows[j].SetUnknown();
2389 DebugDump(debug_level > 1,
"End of Pass 4", theory, rows);
2392 ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);
2394 DebugDump(debug_level > 0,
"Final Paragraph Segmentation", theory, rows);
2402static void InitializeTextAndBoxesPreRecognition(
const MutableIterator &it, RowInfo *info) {
2404 std::string fake_text;
2405 PageIterator pit(
static_cast<const PageIterator &
>(it));
2406 bool first_word =
true;
2411 info->lword_text +=
"x";
2413 info->rword_text +=
"x";
2417 info->rword_text =
"";
2422 if (fake_text.empty()) {
2426 int lspaces = info->pix_ldistance / info->average_interword_space;
2427 for (
int i = 0; i < lspaces; i++) {
2430 info->text += fake_text;
2433 PAGE_RES_IT page_res_it = *it.PageResIt();
2434 WERD_RES *word_res = page_res_it.restart_row();
2435 ROW_RES *this_row = page_res_it.row();
2437 WERD_RES *lword =
nullptr;
2438 WERD_RES *rword =
nullptr;
2439 info->num_words = 0;
2445 if (rword != word_res) {
2450 word_res = page_res_it.forward();
2451 }
while (page_res_it.row() == this_row);
2454 info->lword_box = lword->word->bounding_box();
2457 info->rword_box = rword->word->bounding_box();
2463static void InitializeRowInfo(
bool after_recognition,
const MutableIterator &it, RowInfo *info) {
2464 if (it.PageResIt()->row() !=
nullptr) {
2465 ROW *row = it.PageResIt()->row()->row;
2466 info->pix_ldistance = row->lmargin();
2467 info->pix_rdistance = row->rmargin();
2468 info->average_interword_space =
2469 row->space() > 0 ? row->space() : std::max(
static_cast<int>(row->x_height()), 1);
2470 info->pix_xheight = row->x_height();
2471 info->has_leaders =
false;
2472 info->has_drop_cap = row->has_drop_cap();
2475 info->pix_ldistance = info->pix_rdistance = 0;
2476 info->average_interword_space = 1;
2477 info->pix_xheight = 1.0;
2478 info->has_leaders =
false;
2479 info->has_drop_cap =
false;
2483 info->num_words = 0;
2484 info->lword_indicates_list_item =
false;
2485 info->lword_likely_starts_idea =
false;
2486 info->lword_likely_ends_idea =
false;
2487 info->rword_indicates_list_item =
false;
2488 info->rword_likely_starts_idea =
false;
2489 info->rword_likely_ends_idea =
false;
2490 info->has_leaders =
false;
2493 if (!after_recognition) {
2494 InitializeTextAndBoxesPreRecognition(it, info);
2498 const std::unique_ptr<const char[]> text(it.GetUTF8Text(
RIL_TEXTLINE));
2499 int trailing_ws_idx = strlen(text.get());
2500 while (trailing_ws_idx > 0 &&
2502 isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
2505 if (trailing_ws_idx > 0) {
2506 int lspaces = info->pix_ldistance / info->average_interword_space;
2507 for (
int i = 0; i < lspaces; i++) {
2510 for (
int i = 0; i < trailing_ws_idx; i++) {
2511 info->text += text[i];
2515 if (info->text.empty()) {
2519 PAGE_RES_IT page_res_it = *it.PageResIt();
2520 std::vector<WERD_RES *> werds;
2521 WERD_RES *word_res = page_res_it.restart_row();
2522 ROW_RES *this_row = page_res_it.row();
2523 int num_leaders = 0;
2527 if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2528 werds.push_back(word_res);
2529 ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2530 rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2535 word_res = page_res_it.forward();
2536 }
while (page_res_it.row() == this_row);
2537 info->ltr = ltr >= rtl;
2538 info->has_leaders = num_leaders > 3;
2539 info->num_words = werds.size();
2540 if (!werds.empty()) {
2541 WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2542 info->lword_text = lword->best_choice->unichar_string().c_str();
2543 info->rword_text = rword->best_choice->unichar_string().c_str();
2544 info->lword_box = lword->word->bounding_box();
2545 info->rword_box = rword->word->bounding_box();
2547 &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
2548 &info->lword_likely_ends_idea);
2550 &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
2551 &info->rword_likely_ends_idea);
2559 const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {
2575 std::vector<RowInfo> row_infos;
2581 row_infos.emplace_back();
2582 RowInfo &ri = row_infos.back();
2583 InitializeRowInfo(after_text_recognition, row, &ri);
2588 if (!row_infos.empty()) {
2589 int min_lmargin = row_infos[0].pix_ldistance;
2590 int min_rmargin = row_infos[0].pix_rdistance;
2591 for (
unsigned i = 1; i < row_infos.size(); i++) {
2592 if (row_infos[i].pix_ldistance < min_lmargin) {
2593 min_lmargin = row_infos[i].pix_ldistance;
2595 if (row_infos[i].pix_rdistance < min_rmargin) {
2596 min_rmargin = row_infos[i].pix_rdistance;
2599 if (min_lmargin > 0 || min_rmargin > 0) {
2600 for (
auto &row_info : row_infos) {
2601 row_info.pix_ldistance -= min_lmargin;
2602 row_info.pix_rdistance -= min_rmargin;
2608 std::vector<PARA *> row_owners;
2609 std::vector<PARA *> the_paragraphs;
2610 if (!is_image_block) {
2613 row_owners.resize(row_infos.size());
2619 for (
auto &row_owner : row_owners) {
@ W_REP_CHAR
repeated character
bool NearlyEqual(T x, T y, T tolerance)
bool StrongModel(const ParagraphModel *model)
void tprintf(const char *format,...)
std::vector< const ParagraphModel * > SetOfModels
int InterwordSpace(const std::vector< RowScratchRegisters > &rows, int row_start, int row_end)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool RowsFitModel(const std::vector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
void RecomputeMarginsAndClearHypotheses(std::vector< RowScratchRegisters > *rows, int start, int end, int percentile)
bool ValidBodyLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
void CanonicalizeDetectionResults(std::vector< PARA * > *row_owners, PARA_LIST *paragraphs)
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
const ParagraphModel * kCrownLeft
void push_back_new(std::vector< T > &vector, const T &data)
const ParagraphModel * kCrownRight
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
bool CrownCompatible(const std::vector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
bool ValidFirstLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool AsciiLikelyListItem(const std::string &word)
void DetectParagraphs(int debug_level, std::vector< RowInfo > *row_infos, std::vector< PARA * > *row_owners, PARA_LIST *paragraphs, std::vector< ParagraphModel * > *models)
bool contains(const std::vector< T > &data, const T &value)
bool Empty(PageIteratorLevel level) const
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
bool Next(PageIteratorLevel level) override
const PAGE_RES_IT * PageResIt() const
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
unsigned SkipDigits(unsigned pos)
unsigned SkipRomans(unsigned pos)
unsigned SkipPunc(unsigned pos)
unsigned SkipAlpha(unsigned pos)
Cluster(int cen, int num)
void GetClusters(std::vector< Cluster > *clusters)
SimpleClusterer(int max_cluster_width)
bool IsFullRow(int i) const
std::vector< Cluster > right_tabs
void Fail(int min_debug_level, const char *why) const
bool FirstWordWouldHaveFit(int row_a, int row_b)
std::vector< RowScratchRegisters > * rows
GeometricClassifierState(int dbg_level, std::vector< RowScratchRegisters > *r, int r_start, int r_end)
ParagraphModel Model() const
void AssumeLeftJustification()
tesseract::ParagraphJustification just
const std::vector< Cluster > & AlignTabs() const
void AssumeRightJustification()
const std::vector< Cluster > & OffsideTabs() const
int AlignsideTabIndex(int row_idx) const
std::vector< Cluster > left_tabs
int average_interword_space
void StartHypotheses(SetOfModels *models) const
const ParagraphModel * UniqueStartHypothesis() const
void NonNullHypotheses(SetOfModels *models) const
void AddBodyLine(const ParagraphModel *model)
void StrongHypotheses(SetOfModels *models) const
LineType GetLineType() const
static void AppendDebugHeaderFields(std::vector< std::string > &header)
void AppendDebugInfo(const ParagraphTheory &theory, std::vector< std::string > &dbg) const
int OffsideIndent(tesseract::ParagraphJustification just) const
void DiscardNonMatchingHypotheses(const SetOfModels &models)
void AddStartLine(const ParagraphModel *model)
const ParagraphModel * UniqueBodyHypothesis() const
void Init(const RowInfo &row)
void NonCenteredModels(SetOfModels *models)
std::vector< ParagraphModel * > & models()
const ParagraphModel * Fits(const std::vector< RowScratchRegisters > *rows, int start, int end) const
void DiscardUnusedModels(const SetOfModels &used_models)
int IndexOf(const ParagraphModel *model) const
const ParagraphModel * AddModel(const ParagraphModel &model)
ParagraphModelSmearer(std::vector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
PDBLK pdblk
Page Description Block.
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
BLOCK_RES * block() const
POLY_BLOCK * poly_block() const
UNICHAR_ID unichar_id(unsigned index) const
void add(int32_t value, int32_t count)
double ile(double frac) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const