41static BOOL_VAR(textord_all_prop,
false,
"All doc is proportial text");
43static BOOL_VAR(textord_disable_pitch_test,
false,
"Turn off dp fixed pitch algorithm");
52#define BLOCK_STATS_CLUSTERS 10
53#define MAX_ALLOWED_PITCH 100
56static int sort_floats(
const void *arg1,
const void *arg2) {
57 float diff = *
reinterpret_cast<const float *
>(arg1) - *
reinterpret_cast<const float *
>(arg2);
60 }
else if (diff < 0) {
76 TO_BLOCK_LIST *port_blocks,
86#ifndef GRAPHICS_DISABLED
94 block_it.set_to_list(port_blocks);
96 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
97 block = block_it.data();
104 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
105 block = block_it.data();
114 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
115 block = block_it.data();
117 if (pb !=
nullptr && !pb->
IsText()) {
121 TO_ROW_IT row_it(block->
get_rows());
123 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
125 fix_row_pitch(row, block, port_blocks, row_index, block_index);
130#ifndef GRAPHICS_DISABLED
146 TO_BLOCK_LIST *blocks,
148 int32_t block_target) {
156 TO_BLOCK_IT block_it = blocks;
163 block_votes = like_votes = other_votes = 0;
169 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
170 block = block_it.data();
172 if (pb !=
nullptr && !pb->
IsText()) {
176 TO_ROW_IT row_it(block->
get_rows());
177 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
187 if (block_index == block_target) {
236 }
else if (block_votes <= textord_words_veto_power && like_votes > 0) {
241 if (block_votes == 0 && like_votes == 0 && other_votes > 0 &&
244 "Warning:row %d of block %d set prop with no like rows against "
246 row_target, block_target);
251 tprintf(
":b_votes=%d:l_votes=%d:o_votes=%d", block_votes, like_votes, other_votes);
256 if (block_votes > 0) {
258 }
else if (block_votes == 0 && like_votes > 0) {
261 tprintf(
"Warning:guessing pitch as xheight on row %d, block %d\n", row_target,
278 sp_sd, mid_cuts, &bad_row->
char_cells,
false);
301 tprintf(
"Block %d at (%d,%d)->(%d,%d)\n", block_index, block_box.
left(), block_box.
bottom(),
302 block_box.
right(), block_box.
top());
314#ifndef GRAPHICS_DISABLED
339 TO_ROW_IT row_it = block->
get_rows();
342 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
373 TO_BLOCK_LIST *port_blocks,
383 TO_BLOCK_IT block_it = port_blocks;
386 int16_t projection_left;
387 int16_t projection_right;
406 shift_factor = gradient / (gradient * gradient + 1);
408 TO_ROW_IT row_it(block_it.data()->get_rows());
409 master_x = row_it.data()->projection_left;
410 master_y = row_it.data()->baseline.y(master_x);
411 projection_left = INT16_MAX;
412 projection_right = -INT16_MAX;
417 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
418 block = block_it.data();
419 row_it.set_to_list(block->
get_rows());
420 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
428 row_left =
static_cast<int16_t
>(row->
projection_left - shift_factor * (master_y - row_y));
429 row_right =
static_cast<int16_t
>(row->
projection_right - shift_factor * (master_y - row_y));
430 if (row_left < projection_left) {
431 projection_left = row_left;
433 if (row_right > projection_right) {
434 projection_right = row_right;
441 projection.
set_range(projection_left, projection_right - 1);
443 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
444 block = block_it.data();
445 row_it.set_to_list(block->
get_rows());
446 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
449 row_left =
static_cast<int16_t
>(row->
projection_left - shift_factor * (master_y - row_y));
450 for (x = row->
projection_left; x < row->projection_right; x++, row_left++) {
456 row_it.set_to_list(block_it.data()->get_rows());
458#ifndef GRAPHICS_DISABLED
463 final_pitch = pitches.
ile(0.5);
464 pitch =
static_cast<int16_t
>(final_pitch);
465 pitch_sd =
tune_row_pitch(row, &projection, projection_left, projection_right, pitch * 0.75,
466 final_pitch, sp_sd, mid_cuts, &row->
char_cells,
false);
470 "try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%"
471 "g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
472 prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, pitch_sd / total_row_count,
473 pitch_sd / pitch, pitch_sd / total_row_count / pitch);
476#ifndef GRAPHICS_DISABLED
479 ICOORDELT_LIST *master_cells;
481 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
482 block = block_it.data();
483 row_it.set_to_list(block->
get_rows());
484 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
487 row_shift = shift_factor * (master_y - row_y);
523 int32_t def_fixed = 0;
524 int32_t def_prop = 0;
525 int32_t maybe_fixed = 0;
526 int32_t maybe_prop = 0;
528 int32_t corr_fixed = 0;
529 int32_t corr_prop = 0;
531 TO_ROW_IT row_it = block->
get_rows();
534 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
547 count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
558 }
else if (def_fixed > 0 || def_prop > 0) {
580 int32_t def_fixed = 0;
581 int32_t def_prop = 0;
582 int32_t maybe_fixed = 0;
583 int32_t maybe_prop = 0;
585 int32_t corr_fixed = 0;
586 int32_t corr_prop = 0;
588 count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
590 tprintf(
"Block %d has (%d,%d,%d)", block_index, def_fixed, maybe_fixed, corr_fixed);
594 tprintf(
" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
598 tprintf(
" prop, %d dunno\n", dunno);
610 int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed,
611 int32_t &corr_prop, int32_t &dunno) {
613 TO_ROW_IT row_it = block->
get_rows();
615 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
657 int32_t cluster_count;
659 int32_t smooth_factor;
666 STATS gap_stats(0, maxwidth - 1);
671 if (!blob_it.empty()) {
672 prev_x = blob_it.data()->bounding_box().right();
674 while (!blob_it.at_first()) {
675 blob = blob_it.data();
678 if (blob_box.
left() - prev_x < maxwidth) {
679 gap_stats.
add(blob_box.
left() - prev_x, 1);
681 prev_x = blob_box.
right();
692 gap_stats.
smooth(smooth_factor);
694 prev_count = cluster_count;
698 if (cluster_count < 1) {
701 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
702 gaps[gap_index] = cluster_stats[gap_index + 1].
ile(0.5);
706 tprintf(
"cluster_count=%d:", cluster_count);
707 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
708 tprintf(
" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
712 qsort(gaps, cluster_count,
sizeof(
float), sort_floats);
717 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < lower; gap_index++) {
720 if (gap_index == 0) {
722 tprintf(
"No clusters below nonspace threshold!!\n");
724 if (cluster_count > 1) {
732 row->
pr_nonsp = gaps[gap_index - 1];
733 while (gap_index < cluster_count && gaps[gap_index] < upper) {
736 if (gap_index == cluster_count) {
738 tprintf(
"No clusters above nonspace threshold!!\n");
748 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < upper; gap_index++) {
751 if (gap_index == 0) {
753 tprintf(
"No clusters below space threshold!!\n");
758 row->
fp_nonsp = gaps[gap_index - 1];
759 if (gap_index == cluster_count) {
761 tprintf(
"No clusters above space threshold!!\n");
770 "Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, "
804 STATS gap_stats(0, maxwidth - 1);
806 STATS pitch_stats(0, maxwidth - 1);
814 if (non_space > initial_pitch) {
815 non_space = initial_pitch;
817 min_space = (initial_pitch + non_space) / 2;
819 if (!
count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space,
true,
false,
821 dm_gap_iqr = 0.0001f;
822 dm_pitch_iqr = maxwidth * 2.0f;
823 dm_pitch = initial_pitch;
825 dm_gap_iqr = gap_stats.
ile(0.75) - gap_stats.
ile(0.25);
826 dm_pitch_iqr = pitch_stats.
ile(0.75) - pitch_stats.
ile(0.25);
827 dm_pitch = pitch_stats.
ile(0.5);
831 if (!
count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space,
true,
false, 0)) {
833 pitch_iqr = maxwidth * 3.0f;
835 gap_iqr = gap_stats.
ile(0.75) - gap_stats.
ile(0.25);
836 pitch_iqr = pitch_stats.
ile(0.75) - pitch_stats.
ile(0.25);
839 "First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
841 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.
ile(0.5));
843 initial_pitch = pitch_stats.
ile(0.5);
844 if (min_space > initial_pitch &&
count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch,
845 initial_pitch,
true,
false, 0)) {
846 min_space = initial_pitch;
847 gap_iqr = gap_stats.
ile(0.75) - gap_stats.
ile(0.25);
848 pitch_iqr = pitch_stats.
ile(0.75) - pitch_stats.
ile(0.25);
851 "Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
853 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.
ile(0.5));
855 initial_pitch = pitch_stats.
ile(0.5);
859 tprintf(
"Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", block_index,
860 row_index,
'X', pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
861 pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth
863 : (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ?
'S' :
'M'));
865 if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
872 if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
875 "Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
877 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
879 gap_iqr = gap_stats.
ile(0.75) - gap_stats.
ile(0.25);
880 pitch_iqr = pitch_stats.
ile(0.75) - pitch_stats.
ile(0.25);
881 pitch = pitch_stats.
ile(0.5);
882 used_dm_model =
false;
886 "Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
888 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
890 gap_iqr = dm_gap_iqr;
891 pitch_iqr = dm_pitch_iqr;
893 used_dm_model =
true;
896 tprintf(
"rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", pitch_iqr, gap_iqr, pitch);
897 tprintf(
"p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", pitch_iqr / gap_iqr, pitch_iqr / block->
xheight,
936 const char *res_string;
947 if (textord_all_prop || (pb !=
nullptr && !pb->
IsText())) {
955 if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch &&
958 (row->
used_dm_model || sp_sd > 20 || (pitch_sd == 0 && sp_sd > 10))))) {
959 if (pitch_sd < textord_words_def_fixed * row->fixed_pitch && !row->
all_caps &&
967 if (pitch_sd < textord_words_def_prop * row->fixed_pitch) {
995 tprintf(
":sd/p=%g:occ=%g:init_res=%s\n", pitch_sd / row->
fixed_pitch, sp_sd, res_string);
1013 float initial_pitch,
1015 bool ignore_outsize,
1024 int32_t prev_centre;
1027 int32_t width_units;
1033 pitch_stats->
clear();
1034 if (blob_it.empty()) {
1040 joined_box = blob_it.data()->bounding_box();
1043 blob = blob_it.data();
1046 if ((blob_box.
left() - joined_box.
right() < dm_gap && !blob_it.at_first()) ||
1047 blob->
cblob() ==
nullptr) {
1048 joined_box += blob_box;
1050 blob_width = joined_box.
width();
1051 if (split_outsize) {
1053 static_cast<int32_t
>(floor(
static_cast<float>(blob_width) / initial_pitch + 0.5));
1054 if (width_units < 1) {
1058 }
else if (ignore_outsize) {
1059 width =
static_cast<float>(blob_width) / initial_pitch;
1066 x_centre =
static_cast<int32_t
>(joined_box.
left() +
1067 (blob_width - width_units * initial_pitch) / 2);
1068 if (prev_valid && width_units >= 0) {
1077 gap_stats->
add(joined_box.
left() - prev_right, 1);
1078 pitch_stats->
add(x_centre - prev_centre, 1);
1080 prev_centre =
static_cast<int32_t
>(x_centre + width_units * initial_pitch);
1081 prev_right = joined_box.
right();
1082 prev_valid = blob_box.
left() - joined_box.
right() < min_space;
1083 prev_valid = prev_valid && width_units >= 0;
1084 joined_box = blob_box;
1087 }
while (!blob_it.at_first());
1101 int16_t projection_left,
1102 int16_t projection_right,
1104 float &initial_pitch,
1106 int16_t &best_mid_cuts,
1107 ICOORDELT_LIST *best_cells,
1117 ICOORDELT_LIST test_cells;
1118 ICOORDELT_IT best_it;
1121 return tune_row_pitch2(row, projection, projection_left, projection_right, space_size,
1122 initial_pitch, best_sp_sd,
1124 best_mid_cuts, best_cells, testing_on);
1126 if (textord_disable_pitch_test) {
1127 best_sp_sd = initial_pitch;
1128 return initial_pitch;
1130 initial_sd =
compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1131 initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on);
1132 best_sd = initial_sd;
1133 best_pitch = initial_pitch;
1135 tprintf(
"tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
1139 compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1140 initial_pitch + pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1142 tprintf(
"testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta, pitch_sd);
1144 if (pitch_sd < best_sd) {
1146 best_mid_cuts = mid_cuts;
1148 best_pitch = initial_pitch + pitch_delta;
1149 best_cells->clear();
1150 best_it.set_to_list(best_cells);
1151 best_it.add_list_after(&test_cells);
1155 if (pitch_sd > initial_sd) {
1161 compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1162 initial_pitch - pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1164 tprintf(
"testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, pitch_sd);
1166 if (pitch_sd < best_sd) {
1168 best_mid_cuts = mid_cuts;
1170 best_pitch = initial_pitch - pitch_delta;
1171 best_cells->clear();
1172 best_it.set_to_list(best_cells);
1173 best_it.add_list_after(&test_cells);
1177 if (pitch_sd > initial_sd) {
1181 initial_pitch = best_pitch;
1184 print_pitch_sd(row, projection, projection_left, projection_right, space_size, best_pitch);
1200 int16_t projection_left,
1201 int16_t projection_right,
1203 float &initial_pitch,
1205 int16_t &best_mid_cuts,
1206 ICOORDELT_LIST *best_cells,
1219 best_sp_sd = initial_pitch;
1221 best_pitch =
static_cast<int>(initial_pitch);
1223 return initial_pitch;
1230 for (pixel = projection_left; pixel <= projection_right; pixel++) {
1233 (pixel - projection_left) % (best_pitch + pitch_delta), projection->
pile_count(pixel));
1240 for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
1243 best_delta = pitch_delta;
1249 tprintf(
"tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", initial_pitch, best_delta,
1252 best_pitch += best_delta;
1253 initial_pitch = best_pitch;
1255 best_count += best_count;
1256 for (start = best_pixel - 2;
1257 start > best_pixel - best_pitch &&
1262 for (end = best_pixel + 2;
1263 end < best_pixel + best_pitch &&
1269 best_sd =
compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1270 initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on,
1273 tprintf(
"tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, best_sd);
1277 print_pitch_sd(row, projection, projection_left, projection_right, space_size, initial_pitch);
1293 int16_t projection_left,
1294 int16_t projection_right,
1296 float initial_pitch,
1299 ICOORDELT_LIST *row_cells,
1307 BLOBNBOX_IT start_it;
1308 BLOBNBOX_IT plot_it;
1315 FPSEGPT_LIST seg_list;
1320 ICOORDELT_IT cell_it = row_cells;
1326 int32_t total_count;
1329 word_sync =
compute_pitch_sd2(row, projection, projection_left, projection_right, initial_pitch,
1330 occupation, mid_cuts, row_cells, testing_on, start, end);
1341 if (blob_it.empty()) {
1342 return space_size * 10;
1344#ifndef GRAPHICS_DISABLED
1345 if (testing_on &&
to_win !=
nullptr) {
1346 blob_box = blob_it.data()->bounding_box();
1353 blob_it.mark_cycle_pt();
1355 for (; blob_count > 0; blob_count--) {
1359 prev_box = blob_box;
1362 }
while (!blob_it.cycled_list() && blob_box.
left() - prev_box.
right() < space_size);
1365 word_sync =
check_pitch_sync2(&start_it, blob_count,
static_cast<int16_t
>(initial_pitch), 2,
1366 projection, projection_left, projection_right,
1370 word_sync =
check_pitch_sync(&start_it, blob_count,
static_cast<int16_t
>(initial_pitch), 2,
1371 projection, &seg_list);
1374 tprintf(
"Word ending at (%d,%d), len=%d, sync rating=%g, ", prev_box.
right(), prev_box.
top(),
1375 seg_list.length() - 1, word_sync);
1376 seg_it.set_to_list(&seg_list);
1377 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1378 if (seg_it.data()->faked) {
1381 tprintf(
"%d, ", seg_it.data()->position());
1389#ifndef GRAPHICS_DISABLED
1394 seg_it.set_to_list(&seg_list);
1395 if (prev_right >= 0) {
1396 sp_var = seg_it.data()->position() - prev_right;
1397 sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
1402 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1403 segpos = seg_it.data()->position();
1404 if (cell_it.empty() || segpos > cellpos + initial_pitch / 2) {
1406 while (!cell_it.empty() && segpos > cellpos + initial_pitch * 3 / 2) {
1407 cell =
new ICOORDELT(cellpos +
static_cast<int16_t
>(initial_pitch), 0);
1408 cell_it.add_after_then_move(cell);
1409 cellpos +=
static_cast<int16_t
>(initial_pitch);
1413 cell_it.add_after_then_move(cell);
1415 }
else if (segpos > cellpos - initial_pitch / 2) {
1416 cell = cell_it.data();
1418 cell->
set_x((cellpos + segpos) / 2);
1419 cellpos = cell->
x();
1422 seg_it.move_to_last();
1423 prev_right = seg_it.data()->position();
1425 scale_factor = (seg_list.length() - 2) / 2;
1426 if (scale_factor < 1) {
1432 sqsum += word_sync * scale_factor;
1433 total_count += (seg_list.length() - 1) * scale_factor;
1435 }
while (!blob_it.cycled_list());
1436 sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
1437 return total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
1450 int16_t projection_left,
1451 int16_t projection_right,
1452 float initial_pitch,
1453 int16_t &occupation,
1455 ICOORDELT_LIST *row_cells,
1462 BLOBNBOX_IT plot_it;
1465 FPSEGPT_LIST seg_list;
1469 ICOORDELT_IT cell_it = row_cells;
1474 if (blob_it.empty()) {
1476 return initial_pitch * 10;
1478#ifndef GRAPHICS_DISABLED
1479 if (testing_on &&
to_win !=
nullptr) {
1484 blob_it.mark_cycle_pt();
1489 }
while (!blob_it.cycled_list());
1492 &blob_it, blob_count,
static_cast<int16_t
>(initial_pitch), 2, projection, projection_left,
1495 tprintf(
"Row ending at (%d,%d), len=%d, sync rating=%g, ", blob_box.
right(), blob_box.
top(),
1496 seg_list.length() - 1, word_sync);
1497 seg_it.set_to_list(&seg_list);
1498 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1499 if (seg_it.data()->faked) {
1502 tprintf(
"%d, ", seg_it.data()->position());
1510#ifndef GRAPHICS_DISABLED
1515 seg_it.set_to_list(&seg_list);
1516 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1517 segpos = seg_it.data()->position();
1520 cell_it.add_after_then_move(cell);
1521 if (seg_it.at_last()) {
1522 mid_cuts = seg_it.data()->cheap_cuts();
1526 return occupation > 0 ? sqrt(word_sync / occupation) : initial_pitch * 10;
1539 int16_t projection_left,
1540 int16_t projection_right,
float space_size,
1548 BLOBNBOX_IT start_it;
1549 BLOBNBOX_IT row_start;
1551 int16_t total_blob_count;
1557 FPSEGPT_LIST seg_list;
1565 if (blob_it.empty()) {
1568 row_start = blob_it;
1569 total_blob_count = 0;
1576 blob_it = row_start;
1580 blob_it.mark_cycle_pt();
1582 for (; blob_count > 0; blob_count--) {
1586 prev_box = blob_box;
1589 }
while (!blob_it.cycled_list() && blob_box.
left() - prev_box.
right() < space_size);
1591 &start_it, blob_count,
static_cast<int16_t
>(initial_pitch), 2, projection, projection_left,
1593 total_blob_count += blob_count;
1594 seg_it.set_to_list(&seg_list);
1595 if (prev_right >= 0) {
1596 sp_var = seg_it.data()->position() - prev_right;
1597 sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
1602 seg_it.move_to_last();
1603 prev_right = seg_it.data()->position();
1605 scale_factor = (seg_list.length() - 2) / 2;
1606 if (scale_factor < 1) {
1612 sqsum += word_sync * scale_factor;
1613 total_count += (seg_list.length() - 1) * scale_factor;
1615 }
while (!blob_it.cycled_list());
1616 sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
1617 word_sync = total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
1618 tprintf(
"new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", word_sync, word_sync / initial_pitch, sp_sd,
1621 start_it = row_start;
1622 blob_it = row_start;
1624 check_pitch_sync2(&blob_it, total_blob_count,
static_cast<int16_t
>(initial_pitch), 2,
1625 projection, projection_left, projection_right,
1627 if (occupation > 1) {
1628 word_sync /= occupation;
1630 word_sync = sqrt(word_sync);
1632#ifndef GRAPHICS_DISABLED
1639 if (word_sync < textord_words_def_fixed * initial_pitch && !row->all_caps) {
1648 "row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, "
1650 word_sync, word_sync / initial_pitch,
1664 if (pb !=
nullptr && !pb->
IsText()) {
1670 BLOBNBOX_IT search_it;
1673 int blobcount, repeated_set;
1675 TO_ROW_IT row_it = block->
get_rows();
1676 if (row_it.empty()) {
1679 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1680 row = row_it.data();
1682 if (box_it.empty()) {
1694 if (box_it.data()->repeated_set() != 0 && !box_it.data()->joined_to_prev()) {
1696 repeated_set = box_it.data()->repeated_set();
1698 search_it.forward();
1699 while (!search_it.at_first() && search_it.data()->repeated_set() == repeated_set) {
1701 search_it.forward();
1707 if (!box_it.empty() && box_it.data()->joined_to_prev()) {
1708 tprintf(
"Bad box joined to prev at");
1709 box_it.data()->bounding_box().print();
1710 tprintf(
"After repeated word:");
1713 ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev());
1716 word_it.add_after_then_move(word);
1720 }
while (!box_it.at_first());
1730#ifndef GRAPHICS_DISABLED
1737 TO_ROW_IT row_it = block->
get_rows();
1739 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1740 row = row_it.data();
1741 row->
min_space =
static_cast<int32_t
>((pitch + nonspace) / 2);
#define MAX_ALLOWED_PITCH
#define BLOCK_STATS_CLUSTERS
#define BOOL_VAR(name, val, comment)
#define double_VAR(name, val, comment)
@ W_DONT_CHOP
fixed pitch chopped
@ W_REP_CHAR
repeated character
int textord_dotmatrix_gap
void compute_fixed_pitch(ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient, FCOORD rotation, bool testing_on)
bool try_block_fixed(TO_BLOCK *block, int32_t block_index)
double words_initial_upper
void compute_block_pitch(TO_BLOCK *block, FCOORD rotation, int32_t block_index, bool testing_on)
bool textord_blocksall_prop
void plot_fp_cells2(ScrollView *win, ScrollView::Color colour, TO_ROW *row, FPSEGPT_LIST *seg_list)
double textord_wordstats_smooth_factor
double words_initial_lower
int textord_words_veto_power
bool fixed_pitch_row(TO_ROW *row, BLOCK *block, int32_t block_index)
void plot_fp_word(TO_BLOCK *block, float pitch, float nonspace)
double textord_words_default_nonspace
void tprintf(const char *format,...)
bool textord_show_fixed_cuts
float compute_pitch_sd2(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float initial_pitch, int16_t &occupation, int16_t &mid_cuts, ICOORDELT_LIST *row_cells, bool testing_on, int16_t start, int16_t end)
double words_default_fixed_space
int pitsync_linear_version
void mark_repeated_chars(TO_ROW *row)
WERD * make_real_word(BLOBNBOX_IT *box_it, int32_t blobcount, bool bol, uint8_t blanks)
float compute_pitch_sd(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch, float &sp_sd, int16_t &mid_cuts, ICOORDELT_LIST *row_cells, bool testing_on, int16_t start, int16_t end)
bool try_rows_fixed(TO_BLOCK *block, int32_t block_index, bool testing_on)
double textord_words_default_maxspace
void find_repeated_chars(TO_BLOCK *block, bool testing_on)
double textord_projection_scale
void print_pitch_sd(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch)
bool textord_blockndoc_fixed
double check_pitch_sync2(BLOBNBOX_IT *blob_it, int16_t blob_count, int16_t pitch, int16_t pitch_error, STATS *projection, int16_t projection_left, int16_t projection_right, float projection_scale, int16_t &occupation_count, FPSEGPT_LIST *seg_list, int16_t start, int16_t end)
bool count_pitch_stats(TO_ROW *row, STATS *gap_stats, STATS *pitch_stats, float initial_pitch, float min_space, bool ignore_outsize, bool split_outsize, int32_t dm_gap)
bool textord_pitch_scalebigwords
double textord_words_min_minspace
bool find_row_pitch(TO_ROW *row, int32_t maxwidth, int32_t dm_gap, TO_BLOCK *block, int32_t block_index, int32_t row_index, bool testing_on)
void fix_row_pitch(TO_ROW *bad_row, TO_BLOCK *bad_block, TO_BLOCK_LIST *blocks, int32_t row_target, int32_t block_target)
bool textord_blocksall_fixed
bool textord_debug_pitch_metric
double textord_words_maxspace
float tune_row_pitch2(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
void print_block_counts(TO_BLOCK *block, int32_t block_index)
bool textord_debug_pitch_test
bool row_pitch_stats(TO_ROW *row, int32_t maxwidth, bool testing_on)
double textord_balance_factor
bool textord_show_row_cuts
bool try_doc_fixed(ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient)
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
double words_default_prop_nonspace
bool textord_fast_pitch_test
double textord_fpiqr_ratio
ScrollView * create_to_win(ICOORD page_tr)
double textord_words_pitchsd_threshold
float tune_row_pitch(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
void plot_row_cells(ScrollView *win, ScrollView::Color colour, TO_ROW *row, float xshift, ICOORDELT_LIST *cells)
bool textord_show_initial_words
void count_block_votes(TO_BLOCK *block, int32_t &def_fixed, int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed, int32_t &corr_prop, int32_t &dunno)
double words_default_fixed_limit
double check_pitch_sync(BLOBNBOX_IT *blob_it, int16_t blob_count, int16_t pitch, int16_t pitch_error, STATS *projection, FPSEGPT_LIST *seg_list)
double textord_max_pitch_iqr
double textord_words_def_prop
double textord_words_default_minspace
bool textord_show_page_cuts
double textord_spacesize_ratioprop
bool compute_rows_pitch(TO_BLOCK *block, int32_t block_index, bool testing_on)
TBOX box_next(BLOBNBOX_IT *it)
double textord_pitch_rowsimilarity
const TBOX & bounding_box() const
bool joined_to_prev() const
bool rep_chars_marked() const
ICOORDELT_LIST char_cells
BLOBNBOX_LIST * blob_list()
int num_repeated_sets() const
void compute_vertical_projection()
PITCH_TYPE pitch_decision
PITCH_TYPE pitch_decision
PDBLK pdblk
Page Description Block.
POLY_BLOCK * poly_block() const
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
void set_x(TDimension xin)
rewrite function
TDimension x() const
access function
TDimension bottom() const
void add(int32_t value, int32_t count)
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
int32_t pile_count(int32_t value) const
int32_t get_total() const
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
void smooth(int32_t factor)
double ile(double frac) const
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value)
void set_flag(WERD_FLAGS mask, bool value)
TBOX bounding_box() const