18#define _USE_MATH_DEFINES
29#include <allheaders.h>
37#ifdef INCLUDE_TENSORFLOW
69#ifndef GRAPHICS_DISABLED
76 : randomly_rotate_(false), training_data_(0), sub_trainer_(nullptr) {
82 int debug_interval, int64_t max_memory)
83 : randomly_rotate_(false),
84 training_data_(max_memory),
85 sub_trainer_(nullptr) {
93#ifndef GRAPHICS_DISABLED
104 const char *old_traineddata) {
105 std::vector<char> data;
109 tprintf(
"Loaded file %s, unpacking...\n", filename);
114 tprintf(
"Error, %s is an integer (fast) model, cannot continue training\n",
118 if (((old_traineddata ==
nullptr || *old_traineddata ==
'\0') &&
120 filename == old_traineddata) {
125 if (old_traineddata ==
nullptr || *old_traineddata ==
'\0') {
126 tprintf(
"Must supply the old traineddata for code conversion!\n");
146 std::vector<int> code_map =
MapRecoder(old_chset, old_recoder);
163 int net_flags,
float weight_range,
164 float learning_rate,
float momentum,
172 append_index, net_flags, weight_range,
180 "Training parameters:\n Debug interval = %d,"
181 " weights = %g, learning rate = %g, momentum=%g\n",
189#ifdef INCLUDE_TENSORFLOW
192 TFNetwork *tf_net =
new TFNetwork(
"TensorFlow");
195 tprintf(
"InitFromProtoStr failed!!\n");
221 for (
int i = 0; i <
ET_COUNT; ++i) {
235 const ImageData *trainingdata,
int iteration,
double min_dict_ratio,
236 double dict_ratio_step,
double max_dict_ratio,
double min_cert_offset,
237 double cert_offset_step,
double max_cert_offset, std::string &results) {
247 std::vector<int> truth_labels, ocr_labels, xcoords;
258 results +=
"0,0=" + std::to_string(baseline_error);
261 for (
double r = min_dict_ratio; r < max_dict_ratio; r += dict_ratio_step) {
262 for (
double c = min_cert_offset; c < max_cert_offset;
263 c += cert_offset_step) {
266 search.ExtractBestPathAsLabels(&ocr_labels, &xcoords);
271 if ((r == min_dict_ratio && c == min_cert_offset) ||
272 !std::isfinite(word_error)) {
275 tprintf(
"r=%g, c=%g, truth=%s, ocr=%s, wderr=%g, truth[0]=%d\n", r, c,
276 t.c_str(), o.c_str(), word_error, truth_labels[0]);
278 results +=
" " + std::to_string(r);
279 results +=
"," + std::to_string(c);
280 results +=
"=" + std::to_string(word_error);
296 bool randomly_rotate) {
308 std::string &log_msg) {
330 std::vector<char> rec_model_data;
333 log_msg +=
" New best BCER = " + std::to_string(error_rate);
347 log_msg +=
" failed to write best model:";
349 log_msg +=
" wrote best model:";
352 log_msg += best_model_name;
356 log_msg +=
" New worst BCER = " + std::to_string(error_rate);
361 log_msg +=
"\nDivergence! ";
378 result = sub_trainer_result !=
STR_NONE;
382 std::vector<char> checkpoint;
385 log_msg +=
" failed to write checkpoint.";
387 log_msg +=
" wrote checkpoint.";
408 std::string &log_msg)
const {
409 log_msg += intro_str;
455 uint8_t amount = serialize_amount;
459 if (serialize_amount ==
LIGHT) {
492 std::vector<char> sub_data;
519 tprintf(
"Warning: LSTMTrainer deserialized an LSTMRecognizer!\n");
548 if (amount ==
LIGHT) {
581 std::vector<char> sub_data;
585 if (sub_data.empty()) {
608 log_msg +=
" Failed to revert to previous best for trial!";
611 log_msg +=
" Trial sub_trainer_ from iteration " +
636 double sub_margin = (training_error - sub_error) / sub_error;
638 log_msg +=
" sub_trainer=" + std::to_string(sub_error);
639 log_msg +=
" margin=" + std::to_string(100.0 * sub_margin);
643 while (
sub_trainer_->training_iteration() < end_iteration &&
645 int target_iteration =
647 while (
sub_trainer_->training_iteration() < target_iteration) {
650 std::string batch_log =
"Sub:";
653 tprintf(
"UpdateSubtrainer:%s", batch_log.c_str());
654 log_msg += batch_log;
656 sub_margin = (training_error - sub_error) / sub_error;
661 std::vector<char> updated_trainer;
664 log_msg +=
" Sub trainer wins at iteration " +
677 std::string &log_msg) {
682 "\nReduced learning rate on layers: " + std::to_string(num_reduced);
685 log_msg +=
"\nReduced learning rate to :" + std::to_string(
learning_rate_);
704 int num_layers = layers.size();
705 std::vector<int> num_weights(num_layers);
706 std::vector<TFloat> bad_sums[LR_COUNT];
707 std::vector<TFloat> ok_sums[LR_COUNT];
708 for (
int i = 0; i < LR_COUNT; ++i) {
709 bad_sums[i].resize(num_layers, 0.0);
710 ok_sums[i].resize(num_layers, 0.0);
712 auto momentum_factor = 1 / (1 -
momentum_);
713 std::vector<char> orig_trainer;
715 for (
int i = 0; i < num_layers; ++i) {
720 for (
int s = 0; s < num_samples; ++s) {
722 for (
int ww = 0; ww < LR_COUNT; ++ww) {
724 auto ww_factor = momentum_factor;
734 for (
int i = 0; i < num_layers; ++i) {
735 if (num_weights[i] == 0) {
745 if (trainingdata ==
nullptr) {
749 std::vector<char> updated_trainer;
751 for (
int i = 0; i < num_layers; ++i) {
752 if (num_weights[i] == 0) {
762 layer->
Update(0.0, 0.0, 0.0, 0);
766 float before_bad = bad_sums[ww][i];
767 float before_ok = ok_sums[ww][i];
769 &ok_sums[ww][i], &bad_sums[ww][i]);
771 bad_sums[ww][i] + ok_sums[ww][i] - before_bad - before_ok;
772 if (bad_frac > 0.0f) {
773 bad_frac = (bad_sums[ww][i] - before_bad) / bad_frac;
780 for (
int i = 0; i < num_layers; ++i) {
781 if (num_weights[i] == 0) {
786 TFloat total_down = bad_sums[LR_DOWN][i] + ok_sums[LR_DOWN][i];
787 TFloat total_same = bad_sums[LR_SAME][i] + ok_sums[LR_SAME][i];
788 TFloat frac_down = bad_sums[LR_DOWN][i] / total_down;
789 TFloat frac_same = bad_sums[LR_SAME][i] / total_same;
790 tprintf(
"Layer %d=%s: lr %g->%g%%, lr %g->%g%%", i, layer->
name().c_str(),
791 lr * factor, 100.0 * frac_down, lr, 100.0 * frac_same);
800 if (num_lowered == 0) {
802 for (
int i = 0; i < num_layers; ++i) {
803 if (num_weights[i] > 0) {
818 int null_char, std::vector<int> *labels) {
819 if (str.c_str() ==
nullptr || str.length() <= 0) {
820 tprintf(
"Empty truth string!\n");
824 std::vector<int> internal_labels;
830 if (unicharset.
encode_string(cleaned.c_str(),
true, &internal_labels,
nullptr,
833 for (
auto internal_label : internal_labels) {
834 if (recoder !=
nullptr) {
839 for (
int j = 0; j < len; ++j) {
840 labels->push_back(code(j));
851 labels->push_back(internal_label);
861 tprintf(
"Encoding of string failed! Failure bytes:");
862 while (err_index < cleaned.size()) {
863 tprintf(
" %x", cleaned[err_index++] & 0xff);
892#ifndef GRAPHICS_DISABLED
907 if (trainingdata ==
nullptr) {
908 tprintf(
"Null trainingdata.\n");
914 std::vector<int> truth_labels;
916 tprintf(
"Can't encode transcription: '%s' in language '%s'\n",
921 bool upside_down =
false;
931 for (
auto truth_label : truth_labels) {
936 std::reverse(truth_labels.begin(), truth_labels.end());
940 while (w < truth_labels.size() &&
944 if (w == truth_labels.size()) {
950 bool invert = trainingdata->
boxes().empty();
951 if (!
RecognizeLine(*trainingdata, invert ? 0.5f : 0.0f, debug, invert, upside_down,
952 &image_scale, &inputs, fwd_outputs)) {
960 tprintf(
"Compute simple targets failed for %s!\n",
964 }
else if (loss_type ==
LT_CTC) {
966 tprintf(
"Compute CTC targets failed for %s!\n",
971 tprintf(
"Logistic outputs not implemented yet!\n");
974 std::vector<int> ocr_labels;
975 std::vector<int> xcoords;
978 if (loss_type !=
LT_CTC) {
990 if (truth_text != ocr_text) {
1000 trainingdata->
page_number(), delta_error == 0.0 ?
"(Perfect)" :
"");
1002 if (delta_error == 0.0) {
1017 std::vector<char> *data)
const {
1025 const char *data,
int size) {
1027 tprintf(
"Warning: data size is 0 in LSTMTrainer::ReadLocalTrainingDump\n");
1031 fp.
Open(data, size);
1037 std::vector<char> recognizer_data;
1040 recognizer_data.size());
1056 std::string filename;
1061 filename +=
".checkpoint";
1079 std::vector<int> code_map(num_new_codes, -1);
1080 for (
int c = 0; c < num_new_codes; ++c) {
1084 for (
int uid = 0; uid <= num_new_unichars; ++uid) {
1088 while (code_index < length && codes(code_index) != c) {
1091 if (code_index == length) {
1096 uid < num_new_unichars
1098 : old_chset.
size() - 1;
1099 if (old_uid == INVALID_UNICHAR_ID) {
1104 if (code_index < old_recoder.
EncodeUnichar(old_uid, &old_codes)) {
1105 old_code = old_codes(code_index);
1109 code_map[c] = old_code;
1122 "Must provide a traineddata containing lstm_unicharset and"
1123 " lstm_recoder!\n" !=
nullptr);
1139#ifndef GRAPHICS_DISABLED
1158 const std::vector<int> &truth_labels,
1160 const std::string &truth_text =
DecodeLabels(truth_labels);
1161 if (truth_text.c_str() ==
nullptr || truth_text.length() <= 0) {
1162 tprintf(
"Empty truth string at decode time!\n");
1167 std::vector<int> labels;
1168 std::vector<int> xcoords;
1172 truth_text.c_str());
1173 if (truth_text != text) {
1178 tprintf(
"TRAINING activation path for truth string %s\n",
1179 truth_text.c_str());
1181#ifndef GRAPHICS_DISABLED
1193#ifndef GRAPHICS_DISABLED
1197 const char *window_name,
ScrollView **window) {
1198 int width = targets.
Width();
1202 for (
int c = 0; c < num_features; ++c) {
1206 for (
int t = 0; t < width; ++t) {
1207 double target = targets.
f(t)[c];
1211 (*window)->SetCursor(t - 1, 0);
1214 (*window)->DrawTo(t, target);
1215 }
else if (start_t >= 0) {
1216 (*window)->DrawTo(t, 0);
1217 (*window)->DrawTo(start_t - 1, 0);
1222 (*window)->DrawTo(width, 0);
1223 (*window)->DrawTo(start_t - 1, 0);
1226 (*window)->Update();
1234 const std::vector<int> &truth_labels,
1236 if (truth_labels.size() > targets->
Width()) {
1237 tprintf(
"Error: transcription %s too long to fit into target of width %d\n",
1242 for (
auto truth_label : truth_labels) {
1246 for (i = truth_labels.size(); i < targets->Width(); ++i) {
1267 double char_error,
double word_error) {
1287 double total_error = 0.0;
1288 int width = deltas.
Width();
1290 for (
int t = 0; t < width; ++t) {
1291 const float *class_errs = deltas.
f(t);
1292 for (
int c = 0; c < num_classes; ++c) {
1293 double error = class_errs[c];
1294 total_error += error * error;
1297 return sqrt(total_error / (width * num_classes));
1307 int width = deltas.
Width();
1309 for (
int t = 0; t < width; ++t) {
1310 const float *class_errs = deltas.
f(t);
1311 for (
int c = 0; c < num_classes; ++c) {
1312 float abs_delta = std::fabs(class_errs[c]);
1315 if (0.5 <= abs_delta) {
1320 return static_cast<double>(num_errors) / width;
1325 const std::vector<int> &ocr_str) {
1327 unsigned truth_size = 0;
1328 for (
auto ch : truth_str) {
1334 for (
auto ch : ocr_str) {
1339 unsigned char_errors = 0;
1340 for (
auto label_count : label_counts) {
1341 char_errors += abs(label_count);
1344 if (truth_size <= char_errors) {
1345 return (char_errors == 0) ? 0.0 : 1.0;
1347 return static_cast<double>(char_errors) / truth_size;
1353 std::string *ocr_str) {
1354 using StrMap = std::unordered_map<std::string, int, std::hash<std::string>>;
1355 std::vector<std::string> truth_words =
split(*truth_str,
' ');
1356 if (truth_words.empty()) {
1359 std::vector<std::string> ocr_words =
split(*ocr_str,
' ');
1361 for (
const auto &truth_word : truth_words) {
1362 std::string truth_word_string(truth_word.c_str());
1363 auto it = word_counts.find(truth_word_string);
1364 if (it == word_counts.end()) {
1365 word_counts.insert(std::make_pair(truth_word_string, 1));
1370 for (
const auto &ocr_word : ocr_words) {
1371 std::string ocr_word_string(ocr_word.c_str());
1372 auto it = word_counts.find(ocr_word_string);
1373 if (it == word_counts.end()) {
1374 word_counts.insert(std::make_pair(ocr_word_string, -1));
1379 int word_recall_errs = 0;
1380 for (
const auto &word_count : word_counts) {
1381 if (word_count.second > 0) {
1382 word_recall_errs += word_count.second;
1385 return static_cast<double>(word_recall_errs) / truth_words.size();
1396 double buffer_sum = 0.0;
1397 for (
int i = 0; i < mean_count; ++i) {
1400 double mean = buffer_sum / mean_count;
1415 tprintf(
"Mean rms=%g%%, delta=%g%%, train=%g%%(%g%%), skip ratio=%g%%\n",
1427 const std::vector<char> &model_data,
1463 double two_percent_more = error_rate + 2.0;
1470 tprintf(
"2 Percent improvement time=%d, best error was %g @ %d\n",
1475 if (tester !=
nullptr) {
1488 if (result.length() > 0) {
const double kLearningRateDecay
const double kImprovementFraction
const int kMinStartedErrorRate
void tprintf(const char *format,...)
int IntCastRounded(double x)
@ TESSDATA_LSTM_UNICHARSET
const double kSubTrainerMarginFraction
std::function< std::string(int, const double *, const TessdataManager &, int)> TestCallback
const int kErrorGraphInterval
constexpr size_t countof(T const (&)[N]) noexcept
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
LIST search(LIST list, void *key, int_compare is_equal)
const double kMinDivergenceRate
const int kNumAdjustmentIterations
const double kHighConfidence
const double kBestCheckpointFraction
const int kNumPagesPerBatch
const std::vector< std::string > split(const std::string &s, char c)
const int kMinStallIterations
const double kStageTransitionThreshold
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
const std::string & imagefilename() const
const std::string & transcription() const
const std::string & language() const
const std::vector< TBOX > & boxes() const
TESS_API bool LoadDocuments(const std::vector< std::string > &filenames, CachingStrategy cache_strategy, FileReader reader)
double SignedRand(double range)
void OpenWrite(std::vector< char > *data)
bool DeSerialize(std::string &data)
bool Serialize(const std::string &data)
bool Open(const char *filename, FileReader reader)
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
void SetVersionString(const std::string &v_str)
bool GetComponent(TessdataType type, TFile *fp)
bool SaveFile(const char *filename, FileWriter writer) const
bool Init(const char *data_file_name)
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
bool DeSerialize(TFile *fp)
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
bool has_special_codes() const
bool load_from_file(const char *const filename, bool skip_fragments)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
static std::string CleanupString(const char *utf8_str)
void DebugActivationPath(const NetworkIO &outputs, const std::vector< int > &labels, const std::vector< int > &xcoords)
LossType OutputLossType() const
std::string DecodeLabels(const std::vector< int > &labels)
bool SimpleTextOutput() const
NetworkScratch scratch_space_
bool LoadCharsets(const TessdataManager *mgr)
void LabelsFromOutputs(const NetworkIO &outputs, std::vector< int > *labels, std::vector< int > *xcoords)
void RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0, int lstm_choice_amount=5)
void DisplayForward(const NetworkIO &inputs, const std::vector< int > &labels, const std::vector< int > &label_coords, const char *window_name, ScrollView **window)
void SetIteration(int iteration)
void ScaleLearningRate(double factor)
void ScaleLayerLearningRate(const std::string &id, double factor)
float learning_rate() const
int32_t training_iteration_
int training_iteration() const
int sample_iteration() const
std::vector< std::string > EnumerateLayers() const
float GetLayerLearningRate(const std::string &id) const
Network * GetLayer(const std::string &id) const
bool Serialize(const TessdataManager *mgr, TFile *fp) const
const UNICHARSET & GetUnicharset() const
int32_t sample_iteration_
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
virtual int RemapOutputs(int old_no, const std::vector< int > &code_map)
const std::string & name() const
static void ClearWindow(bool tess_coords, const char *window_name, int width, int height, ScrollView **window)
virtual void SetEnableTraining(TrainingState state)
virtual bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch, NetworkIO *back_deltas)=0
virtual void Update(float learning_rate, float momentum, float adam_beta, int num_samples)
virtual void DebugWeights()=0
bool TestFlag(NetworkFlags flag) const
virtual std::string spec() const
virtual void CountAlternators(const Network &other, TFloat *same, TFloat *changed) const
void Resize(const NetworkIO &src, int num_features)
void SetActivations(int t, int label, float ok_score)
bool AnySuspiciousTruth(float confidence_thr) const
void SubtractAllFromFloat(const NetworkIO &src)
const GENERIC_2D_ARRAY< float > & float_array() const
void Decode(const NetworkIO &output, double dict_ratio, double cert_offset, double worst_dict_cert, const UNICHARSET *charset, int lstm_choice_mode=0)
void ExtractBestPathAsLabels(std::vector< int > *labels, std::vector< int > *xcoords) const
static constexpr float kMinCertainty
static bool ComputeCTCTargets(const std::vector< int > &truth_labels, int null_char, const GENERIC_2D_ARRAY< float > &outputs, NetworkIO *targets)
static void NormalizeProbs(NetworkIO *probs)
static bool InitNetwork(int num_outputs, const char *network_spec, int append_index, int net_flags, float weight_range, TRand *randomizer, Network **network)
bool TransitionTrainingStage(float error_threshold)
std::vector< int32_t > best_error_iterations_
std::vector< char > worst_model_data_
Trainability PrepareForBackward(const ImageData *trainingdata, NetworkIO *fwd_outputs, NetworkIO *targets)
bool ReadLocalTrainingDump(const TessdataManager *mgr, const char *data, int size)
std::string UpdateErrorGraph(int iteration, double error_rate, const std::vector< char > &model_data, const TestCallback &tester)
bool EncodeString(const std::string &str, std::vector< int > *labels) const
double error_rates_[ET_COUNT]
bool LoadAllTrainingData(const std::vector< std::string > &filenames, CachingStrategy cache_strategy, bool randomly_rotate)
double ComputeErrorRates(const NetworkIO &deltas, double char_error, double word_error)
int InitTensorFlowNetwork(const std::string &tf_proto)
double ComputeWordError(std::string *truth_str, std::string *ocr_str)
void ReduceLearningRates(LSTMTrainer *samples_trainer, std::string &log_msg)
double NewSingleError(ErrorTypes type) const
void PrepareLogMsg(std::string &log_msg) const
bool ComputeCTCTargets(const std::vector< int > &truth_labels, NetworkIO *outputs, NetworkIO *targets)
std::vector< char > best_trainer_
double worst_error_rates_[ET_COUNT]
bool MaintainCheckpoints(const TestCallback &tester, std::string &log_msg)
void SaveRecognitionDump(std::vector< char > *data) const
bool Serialize(SerializeAmount serialize_amount, const TessdataManager *mgr, TFile *fp) const
bool ComputeTextTargets(const NetworkIO &outputs, const std::vector< int > &truth_labels, NetworkIO *targets)
float error_rate_of_last_saved_best_
int last_perfect_training_iteration_
void FillErrorBuffer(double new_error, ErrorTypes type)
void LogIterations(const char *intro_str, std::string &log_msg) const
int learning_iteration() const
bool SaveTraineddata(const char *filename)
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
double ComputeRMSError(const NetworkIO &deltas)
Trainability GridSearchDictParams(const ImageData *trainingdata, int iteration, double min_dict_ratio, double dict_ratio_step, double max_dict_ratio, double min_cert_offset, double cert_offset_step, double max_cert_offset, std::string &results)
double ComputeWinnerError(const NetworkIO &deltas)
std::string checkpoint_name_
bool InitNetwork(const char *network_spec, int append_index, int net_flags, float weight_range, float learning_rate, float momentum, float adam_beta)
void StartSubtrainer(std::string &log_msg)
SubTrainerResult UpdateSubtrainer(std::string &log_msg)
void UpdateErrorBuffer(double new_error, ErrorTypes type)
int32_t improvement_steps_
int CurrentTrainingStage() const
std::string DumpFilename() const
std::vector< char > best_model_data_
bool SaveTrainingDump(SerializeAmount serialize_amount, const LSTMTrainer &trainer, std::vector< char > *data) const
bool DebugLSTMTraining(const NetworkIO &inputs, const ImageData &trainingdata, const NetworkIO &fwd_outputs, const std::vector< int > &truth_labels, const NetworkIO &outputs)
DocumentCache training_data_
int checkpoint_iteration_
static const int kRollingBufferSize_
int prev_sample_iteration_
std::vector< double > error_buffers_[ET_COUNT]
std::unique_ptr< LSTMTrainer > sub_trainer_
double ComputeCharError(const std::vector< int > &truth_str, const std::vector< int > &ocr_str)
void DisplayTargets(const NetworkIO &targets, const char *window_name, ScrollView **window)
bool ReadTrainingDump(const std::vector< char > &data, LSTMTrainer &trainer) const
int ReduceLayerLearningRates(TFloat factor, int num_samples, LSTMTrainer *samples_trainer)
const ImageData * TrainOnLine(LSTMTrainer *samples_trainer, bool batch)
std::vector< double > best_error_history_
bool TryLoadingCheckpoint(const char *filename, const char *old_traineddata)
double best_error_rates_[ET_COUNT]
std::vector< int > MapRecoder(const UNICHARSET &old_chset, const UnicharCompress &old_recoder) const
SVEvent * AwaitEvent(SVEventType type)