14#include "gmock/gmock.h"
// Test flow, as far as the visible statements show: build a language model for
// "eng" with CombineLangModel, encode kTestString with trainer1, check that the
// unicharset has grown by exactly one entry, build a second model named
// "extended", and compare what trainer2 encodes against trainer1's labels.
// NOTE(review): this listing omits several source lines -- the declarations of
// unicharset, words, puncs, numbers, trainer1, trainer2, null1 and null2, the
// step that enlarges the unicharset, and the closing braces are not visible.
// The comments below describe only the statements that are shown.
32TEST(LangModelTest, AddACharacter) {
// Two inputs: a plain ASCII string, and an ASCII string that also contains the
// rupee sign.
33 constexpr char kTestString[] =
"Simple ASCII string to encode !@#$%&";
34 constexpr char kTestStringRupees[] =
"ASCII string with Rupee symbol ₹";
// Arguments that are later passed to CombineLangModel.
36 std::string script_dir = LANGDATA_DIR;
41 std::string version_str =
"TestVersion";
43 std::string output_dir = FLAGS_test_tmpdir;
// Log the output directory.
44 LOG(
INFO) <<
"Output dir=" << output_dir <<
"\n";
45 std::string lang1 =
"eng";
46 bool pass_through_recoder =
false;
// words, puncs and numbers must each be non-empty.
// NOTE(review): each check compares a container size with the int literal 0,
// which may produce a sign-compare warning -- confirm against the build flags.
49 EXPECT_GT(words.size(), 0);
51 EXPECT_GT(puncs.size(), 0);
53 EXPECT_GT(numbers.size(), 0);
54 bool lang_is_rtl =
false;
// Building the model for lang1 must return 0. The trailing arguments of the
// call are cut off in this listing.
56 EXPECT_EQ(0,
CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,
57 pass_through_recoder, words, puncs, numbers, lang_is_rtl,
nullptr,
// traineddata path assembled from output_dir and lang1 (used twice) plus the
// ".traineddata" suffix.
60 std::string traineddata1 =
file::JoinPath(output_dir, lang1, lang1) +
".traineddata";
// Encoding kTestString with trainer1 must succeed; the labels are then decoded
// again so the result can be logged.
63 std::vector<int> labels1;
64 EXPECT_TRUE(trainer1.
EncodeString(kTestString, &labels1));
65 std::string test1_decoded = trainer1.
DecodeLabels(labels1);
// test1_str is a copy of test1_decoded built from its first character and its
// length.
66 std::string test1_str(&test1_decoded[0], test1_decoded.length());
67 LOG(
INFO) <<
"Labels1=" << test1_str <<
"\n";
// Remember the unicharset size; afterwards it must be larger by exactly one.
// The statement that changes the unicharset is not visible in this listing.
70 int size_before = unicharset.
size();
73 EXPECT_EQ(size_before + 1, unicharset.
size());
// Build a second model named "extended" from the same arguments. The expected
// result is spelled EXIT_SUCCESS here, whereas the first build compares with 0.
75 std::string lang2 =
"extended";
76 EXPECT_EQ(EXIT_SUCCESS,
CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,
77 pass_through_recoder, words, puncs, numbers, lang_is_rtl,
80 std::string traineddata2 =
file::JoinPath(output_dir, lang2, lang2) +
".traineddata";
// Encoding the same kTestString with trainer2 must succeed as well.
83 std::vector<int> labels2;
84 EXPECT_TRUE(trainer2.
EncodeString(kTestString, &labels2));
85 std::string test2_decoded = trainer2.
DecodeLabels(labels2);
86 std::string test2_str(&test2_decoded[0], test2_decoded.length());
87 LOG(
INFO) <<
"Labels2=" << test2_str <<
"\n";
// trainer2 must also be able to encode the string containing the rupee sign.
89 std::vector<int> labels3;
90 EXPECT_TRUE(trainer2.
EncodeString(kTestStringRupees, &labels3));
91 std::string test3_decoded = trainer2.
DecodeLabels(labels3);
92 std::string test3_str(&test3_decoded[0], test3_decoded.length());
93 LOG(
INFO) <<
"labels3=" << test3_str <<
"\n";
// null2 must be exactly one greater than null1 (both are defined on lines that
// are not visible here).
100 EXPECT_EQ(null1 + 1, null2);
// Build labels1_v from labels1: entries equal to null1 become null2, the
// remaining entries are copied unchanged.
101 std::vector<int> labels1_v(labels1.size());
102 for (
unsigned i = 0; i < labels1.size(); ++i) {
103 if (labels1[i] == null1) {
104 labels1_v[i] = null2;
106 labels1_v[i] = labels1[i];
// After that substitution the labels from trainer1 must match the labels that
// trainer2 produced for kTestString.
// NOTE(review): &labels2[0] is not valid if labels2 is empty, and the earlier
// EXPECT_TRUE does not stop the test on failure -- consider labels2.data() or
// passing the vector itself; confirm.
109 EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size()));
// trainer1 must fail to encode the rupee string while trainer2 must succeed.
// labels1 and labels2 are passed as the output vectors again here.
112 EXPECT_FALSE(trainer1.
EncodeString(kTestStringRupees, &labels1));
113 EXPECT_TRUE(trainer2.
EncodeString(kTestStringRupees, &labels2));
// Same flow as the visible statements of AddACharacter, with Devanagari input:
// build a language model for "hin" with CombineLangModel, encode kTestString
// with trainer1, check that the unicharset has grown by exactly one entry,
// build a second model named "extendedhin", and compare what trainer2 encodes
// against trainer1's labels.
// NOTE(review): this listing omits several source lines -- the declarations of
// unicharset, words, puncs, numbers, trainer1, trainer2, null1 and null2, the
// step that enlarges the unicharset, and the closing braces are not visible.
// The comments below describe only the statements that are shown.
117TEST(LangModelTest, AddACharacterHindi) {
// Two inputs in Devanagari script; the second one also contains the rupee
// sign.
118 constexpr char kTestString[] =
"हिन्दी में एक लाइन लिखें";
119 constexpr char kTestStringRupees[] =
"हिंदी में रूपये का चिन्ह प्रयोग करें ₹१००.००";
// Arguments that are later passed to CombineLangModel.
121 std::string script_dir = LANGDATA_DIR;
126 std::string version_str =
"TestVersion";
128 std::string output_dir = FLAGS_test_tmpdir;
// Log the output directory.
129 LOG(
INFO) <<
"Output dir=" << output_dir <<
"\n";
130 std::string lang1 =
"hin";
131 bool pass_through_recoder =
false;
// words, puncs and numbers must each be non-empty.
// NOTE(review): each check compares a container size with the int literal 0,
// which may produce a sign-compare warning -- confirm against the build flags.
134 EXPECT_GT(words.size(), 0);
136 EXPECT_GT(puncs.size(), 0);
138 EXPECT_GT(numbers.size(), 0);
139 bool lang_is_rtl =
false;
// Building the model for lang1 must return 0. The trailing arguments of the
// call are cut off in this listing.
141 EXPECT_EQ(0,
CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,
142 pass_through_recoder, words, puncs, numbers, lang_is_rtl,
nullptr,
// traineddata path assembled from output_dir and lang1 (used twice) plus the
// ".traineddata" suffix.
145 std::string traineddata1 =
file::JoinPath(output_dir, lang1, lang1) +
".traineddata";
// Encoding kTestString with trainer1 must succeed; the labels are then decoded
// again so the result can be logged.
148 std::vector<int> labels1;
149 EXPECT_TRUE(trainer1.
EncodeString(kTestString, &labels1));
150 std::string test1_decoded = trainer1.
DecodeLabels(labels1);
// test1_str is a copy of test1_decoded built from its first character and its
// length.
151 std::string test1_str(&test1_decoded[0], test1_decoded.length());
152 LOG(
INFO) <<
"Labels1=" << test1_str <<
"\n";
// Remember the unicharset size; afterwards it must be larger by exactly one.
// The statement that changes the unicharset is not visible in this listing.
155 int size_before = unicharset.
size();
158 EXPECT_EQ(size_before + 1, unicharset.
size());
// Build a second model named "extendedhin" from the same arguments. The
// expected result is spelled EXIT_SUCCESS here, whereas the first build
// compares with 0.
160 std::string lang2 =
"extendedhin";
161 EXPECT_EQ(EXIT_SUCCESS,
CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,
162 pass_through_recoder, words, puncs, numbers, lang_is_rtl,
165 std::string traineddata2 =
file::JoinPath(output_dir, lang2, lang2) +
".traineddata";
// Encoding the same kTestString with trainer2 must succeed as well.
168 std::vector<int> labels2;
169 EXPECT_TRUE(trainer2.
EncodeString(kTestString, &labels2));
170 std::string test2_decoded = trainer2.
DecodeLabels(labels2);
171 std::string test2_str(&test2_decoded[0], test2_decoded.length());
172 LOG(
INFO) <<
"Labels2=" << test2_str <<
"\n";
// trainer2 must also be able to encode the string containing the rupee sign.
174 std::vector<int> labels3;
175 EXPECT_TRUE(trainer2.
EncodeString(kTestStringRupees, &labels3));
176 std::string test3_decoded = trainer2.
DecodeLabels(labels3);
177 std::string test3_str(&test3_decoded[0], test3_decoded.length());
178 LOG(
INFO) <<
"labels3=" << test3_str <<
"\n";
// null2 must be exactly one greater than null1 (both are defined on lines that
// are not visible here).
185 EXPECT_EQ(null1 + 1, null2);
// Build labels1_v from labels1: entries equal to null1 become null2, the
// remaining entries are copied unchanged.
186 std::vector<int> labels1_v(labels1.size());
187 for (
unsigned i = 0; i < labels1.size(); ++i) {
188 if (labels1[i] == null1) {
189 labels1_v[i] = null2;
191 labels1_v[i] = labels1[i];
// After that substitution the labels from trainer1 must match the labels that
// trainer2 produced for kTestString.
// NOTE(review): &labels2[0] is not valid if labels2 is empty, and the earlier
// EXPECT_TRUE does not stop the test on failure -- consider labels2.data() or
// passing the vector itself; confirm.
194 EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size()));
// trainer1 must fail to encode the rupee string while trainer2 must succeed.
// labels1 and labels2 are passed as the output vectors again here.
197 EXPECT_FALSE(trainer1.
EncodeString(kTestStringRupees, &labels1));
198 EXPECT_TRUE(trainer2.
EncodeString(kTestStringRupees, &labels2));
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
std::string TestDataNameToPath(const std::string &name)
std::string ReadFile(const std::string &filename, FileReader reader)
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, const std::string &version_str, const std::string &output_dir, const std::string &lang, bool pass_through_recoder, const std::vector< std::string > &words, const std::vector< std::string > &puncs, const std::vector< std::string > &numbers, bool lang_is_rtl, FileReader reader, FileWriter writer)
const std::vector< std::string > split(const std::string &s, char c)
TEST(TesseractInstanceTest, TestMultipleTessInstances)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
bool load_from_file(const char *const filename, bool skip_fragments)
std::string DecodeLabels(const std::vector< int > &labels)
bool EncodeString(const std::string &str, std::vector< int > *labels) const
bool InitCharSet(const std::string &traineddata_path)
static std::string JoinPath(const std::string &s1, const std::string &s2)