tesseract 5.2.0
Loading...
Searching...
No Matches
tesseract::RecodeBeamTest Class Reference
Inheritance diagram for tesseract::RecodeBeamTest:

Protected Member Functions

void SetUp () override
 
 RecodeBeamTest ()
 
 ~RecodeBeamTest () override
 
void LoadUnicharset (const std::string &unicharset_name)
 
void LoadDict (const std::string &lang)
 
void ExpectCorrect (const GENERIC_2D_ARRAY< float > &output, const std::vector< int > &transcription)
 
void ExpectCorrect (const GENERIC_2D_ARRAY< float > &output, const std::string &truth_utf8, Dict *dict, PointerVector< WERD_RES > *words)
 
GENERIC_2D_ARRAY< float > GenerateRandomPaddedOutputs (const std::vector< int > &unichar_ids, int padding)
 
int EncodeUTF8 (const char *utf8_str, float score, int start_t, TRand *random, GENERIC_2D_ARRAY< float > *outputs)
 
GENERIC_2D_ARRAY< float > GenerateSyntheticOutputs (const char *chars1[], const float scores1[], const char *chars2[], const float scores2[], TRand *random)
 

Protected Attributes

UnicharCompress recoder_
 
int unichar_null_char_ = 0
 
int encoded_null_char_ = 0
 
CCUtil ccutil_
 
Dict lstm_dict_
 

Detailed Description

Definition at line 58 of file recodebeam_test.cc.

Constructor & Destructor Documentation

◆ RecodeBeamTest()

tesseract::RecodeBeamTest::RecodeBeamTest ( )
inline protected

◆ ~RecodeBeamTest()

tesseract::RecodeBeamTest::~RecodeBeamTest ( )
inline override protected

Definition at line 66 of file recodebeam_test.cc.

66 {
67 lstm_dict_.End();
68 }
void End()
Definition: dict.cpp:379

Member Function Documentation

◆ EncodeUTF8()

int tesseract::RecodeBeamTest::EncodeUTF8 ( const char *  utf8_str,
float  score,
int  start_t,
TRand *  random,
GENERIC_2D_ARRAY< float > *  outputs 
)
inline protected

Definition at line 244 of file recodebeam_test.cc.

245 {
246 int t = start_t;
247 std::vector<int> unichar_ids;
248 EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));
249 if (unichar_ids.empty() || utf8_str[0] == '\0') {
250 unichar_ids.clear();
251 unichar_ids.push_back(unichar_null_char_);
252 }
253 int num_ids = unichar_ids.size();
254 for (int u = 0; u < num_ids; ++u) {
255 RecodedCharID code;
256 int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
257 EXPECT_NE(0, len);
258 for (int i = 0; i < len; ++i) {
259 // Apply the desired score.
260 (*outputs)(t++, code(i)) = score;
261 if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
262 int dups = static_cast<int>(random->UnsignedRand(3.0));
263 for (int d = 0; d < dups; ++d) {
264 // Duplicate the desired score.
265 (*outputs)(t++, code(i)) = score;
266 }
267 }
268 }
269 if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
270 int dups = static_cast<int>(random->UnsignedRand(3.0));
271 for (int d = 0; d < dups; ++d) {
272 // Add a random number of nulls as well.
273 (*outputs)(t++, encoded_null_char_) = score;
274 }
275 }
276 }
277 return t;
278 }
UNICHARSET unicharset
Definition: ccutil.h:61
static const int kMaxCodeLen
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239

◆ ExpectCorrect() [1/2]

void tesseract::RecodeBeamTest::ExpectCorrect ( const GENERIC_2D_ARRAY< float > &  output,
const std::string &  truth_utf8,
Dict *  dict,
PointerVector< WERD_RES > *  words 
)
inline protected

Definition at line 115 of file recodebeam_test.cc.

116 {
117 RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
118 beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
119 // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
120 // beam_search.DebugBeams(ccutil_.unicharset);
121 std::vector<int> labels, xcoords;
122 beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
123 LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
124 // Now decode using recoder_.
125 std::string decoded;
126 int end = 1;
127 for (unsigned start = 0; start < labels.size(); start = end) {
128 RecodedCharID code;
129 unsigned index = start;
130 int uni_id = INVALID_UNICHAR_ID;
131 do {
132 code.Set(code.length(), labels[index++]);
133 uni_id = recoder_.DecodeUnichar(code);
134 } while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&
135 (uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));
136 EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
137 // To the extent of truth_utf8, we expect decoded to match, but if
138 // transcription is shorter, that is OK too, as we may just be testing
139 // that we get a valid sequence when padded with random data.
140 if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) {
141 decoded += ccutil_.unicharset.id_to_unichar(uni_id);
142 }
143 end = index;
144 }
145 EXPECT_EQ(truth_utf8, decoded);
146
147 // Check that ExtractBestPathAsUnicharIds does the same thing.
148 std::vector<int> unichar_ids;
149 std::vector<float> certainties, ratings;
150 beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,
151 &ratings, &xcoords);
152 std::string u_decoded;
153 float total_rating = 0.0f;
154 for (unsigned u = 0; u < unichar_ids.size(); ++u) {
155 // To the extent of truth_utf8, we expect decoded to match, but if
156 // transcription is shorter, that is OK too, as we may just be testing
157 // that we get a valid sequence when padded with random data.
158 if (u_decoded.size() < truth_utf8.size()) {
159 const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
160 total_rating += ratings[u];
161 LOG(INFO) << u << ":u_id=" << unichar_ids[u] << "=" << str << ", c="
162 << certainties[u] << ", r=" << ratings[u] << "r_sum="
163 << total_rating << " @" << xcoords[u] << "\n";
164 if (str[0] == ' ') {
165 total_rating = 0.0f;
166 }
167 u_decoded += str;
168 }
169 }
170 EXPECT_EQ(truth_utf8, u_decoded);
171
172 // Check that ExtractBestPathAsWords does the same thing.
173 TBOX line_box(0, 0, 100, 10);
174 for (int i = 0; i < 2; ++i) {
175 beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);
176 std::string w_decoded;
177 for (int w = 0; w < words->size(); ++w) {
178 const WERD_RES *word = (*words)[w];
179 if (w_decoded.size() < truth_utf8.size()) {
180 if (!w_decoded.empty() && word->word->space()) {
181 w_decoded += " ";
182 }
183 w_decoded += word->best_choice->unichar_string().c_str();
184 }
185 LOG(INFO) << "Word:" << w << " = " << word->best_choice->unichar_string()
186 << ", c=" << word->best_choice->certainty() << ", r=" << word->best_choice->rating()
187 << ", perm=" << word->best_choice->permuter() << "\n";
188 }
189 std::string w_trunc(w_decoded.data(), truth_utf8.size());
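190 if (truth_utf8 != w_trunc) {
191 tesseract::NormalizeUTF8String(
192 tesseract::UnicodeNormMode::kNFKD, tesseract::OCRNorm::kNormalize,
193 tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);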
194 w_trunc.assign(w_decoded.data(), truth_utf8.size());
195 }
196 EXPECT_EQ(truth_utf8, w_trunc);
197 }
198 }
@ INFO
Definition: log.h:28
@ TBOX
@ LOG
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:152
bool IsValidFirstCode(int code) const
int DecodeUnichar(const RecodedCharID &code) const
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

◆ ExpectCorrect() [2/2]

void tesseract::RecodeBeamTest::ExpectCorrect ( const GENERIC_2D_ARRAY< float > &  output,
const std::vector< int > &  transcription 
)
inline protected

Definition at line 105 of file recodebeam_test.cc.

106 {
107 // Get the utf8 string of the transcription.
108 std::string truth_utf8;
109 for (int i : transcription) {
110 truth_utf8 += ccutil_.unicharset.id_to_unichar(i);
111 }
112 PointerVector<WERD_RES> words;
113 ExpectCorrect(output, truth_utf8, nullptr, &words);
114 }
void ExpectCorrect(const GENERIC_2D_ARRAY< float > &output, const std::vector< int > &transcription)

◆ GenerateRandomPaddedOutputs()

GENERIC_2D_ARRAY< float > tesseract::RecodeBeamTest::GenerateRandomPaddedOutputs ( const std::vector< int > &  unichar_ids,
int  padding 
)
inline protected

Definition at line 201 of file recodebeam_test.cc.

202 {
203 int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
204 int num_codes = recoder_.code_range();
205 GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
206 // Fill with random data.
207 TRand random;
208 for (int t = 0; t < width; ++t) {
209 for (int i = 0; i < num_codes; ++i) {
210 outputs(t, i) = random.UnsignedRand(0.25);
211 }
212 }
213 int t = 0;
214 for (int unichar_id : unichar_ids) {
215 RecodedCharID code;
216 int len = recoder_.EncodeUnichar(unichar_id, &code);
217 EXPECT_NE(0, len);
218 for (int j = 0; j < len; ++j) {
219 // Make the desired answer a clear winner.
220 if (j > 0 && code(j) == code(j - 1)) {
221 // We will collapse adjacent equal codes so put a null in between.
222 outputs(t++, encoded_null_char_) = 1.0f;
223 }
224 outputs(t++, code(j)) = 1.0f;
225 }
226 // Put a 0 as a null char in between.
227 outputs(t++, encoded_null_char_) = 1.0f;
228 }
229 // Normalize the probs.
230 for (int t = 0; t < width; ++t) {
231 double sum = 0.0;
232 for (int i = 0; i < num_codes; ++i) {
233 sum += outputs(t, i);
234 }
235 for (int i = 0; i < num_codes; ++i) {
236 outputs(t, i) /= sum;
237 }
238 }
239
240 return outputs;
241 }

◆ GenerateSyntheticOutputs()

GENERIC_2D_ARRAY< float > tesseract::RecodeBeamTest::GenerateSyntheticOutputs ( const char *  chars1[],
const float  scores1[],
const char *  chars2[],
const float  scores2[],
TRand *  random
)
inline protected

Definition at line 283 of file recodebeam_test.cc.

285 {
286 int width = 0;
287 while (chars1[width] != nullptr) {
288 ++width;
289 }
290 int padding = width * RecodedCharID::kMaxCodeLen;
291 int num_codes = recoder_.code_range();
292 GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
293 int t = 0;
294 for (int i = 0; i < width; ++i) {
295 // In case there is overlap in the codes between 1st and 2nd choice, it
296 // is better to encode the 2nd choice first.
297 int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
298 int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
299 // Advance t to the max end, setting everything else to the leftovers.
300 int max_t = std::max(end_t1, end_t2);
301 while (t < max_t) {
302 double total_score = 0.0;
303 for (int j = 0; j < num_codes; ++j) {
304 total_score += outputs(t, j);
305 }
306 double null_remainder = (1.0 - total_score) / 2.0;
307 double remainder = null_remainder / (num_codes - 2);
308 if (outputs(t, encoded_null_char_) < null_remainder) {
309 outputs(t, encoded_null_char_) += null_remainder;
310 } else {
311 remainder += remainder;
312 }
313 for (int j = 0; j < num_codes; ++j) {
314 if (outputs(t, j) == 0.0f) {
315 outputs(t, j) = remainder;
316 }
317 }
318 ++t;
319 }
320 }
321 // Fill the rest with null chars.
322 while (t < width + padding) {
323 outputs(t++, encoded_null_char_) = 1.0f;
324 }
325 return outputs;
326 }
int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random, GENERIC_2D_ARRAY< float > *outputs)

◆ LoadDict()

void tesseract::RecodeBeamTest::LoadDict ( const std::string &  lang)
inline protected

Definition at line 94 of file recodebeam_test.cc.

94 {
95 std::string traineddata_name = lang + ".traineddata";
96 std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
97 lstm_dict_.SetupForLoad(nullptr);
98 tesseract::TessdataManager mgr;
99 mgr.Init(traineddata_file.c_str());
100 lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
101 lstm_dict_.FinishLoad();
102 }
bool Init(const char *data_file_name)
void LoadLSTM(const std::string &lang, TessdataManager *data_file)
Definition: dict.cpp:291
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:180
bool FinishLoad()
Definition: dict.cpp:357
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65

◆ LoadUnicharset()

void tesseract::RecodeBeamTest::LoadUnicharset ( const std::string &  unicharset_name)
inline protected

Definition at line 71 of file recodebeam_test.cc.

71 {
72 std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
73 std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
74 std::string radical_data;
75 CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
76 CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
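77 unichar_null_char_ =
78 ccutil_.unicharset.has_special_codes() ? UNICHAR_BROKEN : ccutil_.unicharset.size();
79 std::string radical_str(radical_data.c_str());
80 EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, &radical_str));
81 RecodedCharID code;
82 recoder_.EncodeUnichar(unichar_null_char_, &code);
83 encoded_null_char_ = code(0);
84 // Space should encode as itself.
85 recoder_.EncodeUnichar(UNICHAR_SPACE, &code);
86 EXPECT_EQ(UNICHAR_SPACE, code(0));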
87 std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
88 std::string encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
89 std::string encoding_str(&encoding[0], encoding.size());
90 CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
91 LOG(INFO) << "Wrote encoding to:" << output_name << "\n";
92 }
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ UNICHAR_BROKEN
Definition: unicharset.h:38
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool has_special_codes() const
Definition: unicharset.h:756
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
size_t size() const
Definition: unicharset.h:355
static int Defaults()
Definition: include_gunit.h:61
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:56
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52

◆ SetUp()

void tesseract::RecodeBeamTest::SetUp ( )
inline override protected

Definition at line 60 of file recodebeam_test.cc.

60 {
61 std::locale::global(std::locale(""));
62 file::MakeTmpdir();
63 }
static void MakeTmpdir()
Definition: include_gunit.h:38

Member Data Documentation

◆ ccutil_

CCUtil tesseract::RecodeBeamTest::ccutil_
protected

Definition at line 330 of file recodebeam_test.cc.

◆ encoded_null_char_

int tesseract::RecodeBeamTest::encoded_null_char_ = 0
protected

Definition at line 329 of file recodebeam_test.cc.

◆ lstm_dict_

Dict tesseract::RecodeBeamTest::lstm_dict_
protected

Definition at line 331 of file recodebeam_test.cc.

◆ recoder_

UnicharCompress tesseract::RecodeBeamTest::recoder_
protected

Definition at line 327 of file recodebeam_test.cc.

◆ unichar_null_char_

int tesseract::RecodeBeamTest::unichar_null_char_ = 0
protected

Definition at line 328 of file recodebeam_test.cc.


The documentation for this class was generated from the following file: