36 "Normalization mode: 1=Combine graphemes, "
37 "2=Split graphemes, 3=Pure unicode");
43static void AddStringsToUnicharset(
const std::vector<std::string> &strings,
int norm_mode,
45 for (
const auto &
string : strings) {
46 std::vector<std::string> normalized;
49 true,
string.c_str(), &normalized)) {
50 for (
const std::string &normed : normalized) {
58 tprintf(
"Normalization failed for string '%s'\n",
string.c_str());
63static int Main(
int argc,
char **argv) {
66 for (
int arg = 1; arg < argc; ++arg) {
68 if (file_data.empty()) {
71 std::vector<std::string> texts;
73 false,
nullptr, &texts,
75 tprintf(
"Extracting unicharset from box file %s\n", argv[arg]);
77 tprintf(
"Extracting unicharset from plain text file %s\n", argv[arg]);
79 texts =
split(file_data,
'\n');
81 AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
85 if (unicharset.
save_to_file(FLAGS_output_unicharset.c_str())) {
86 tprintf(
"Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
88 tprintf(
"Cannot save unicharset file %s\n", FLAGS_output_unicharset.c_str());
96int main(
int argc,
char **argv) {
97 tesseract::CheckSharedLibraryVersion();
103 "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
104 " box_or_text_file [...]\n",
106 tprintf(
"Where mode means:\n");
107 tprintf(
" 1=combine graphemes (use for Latin and other simple scripts)\n");
108 tprintf(
" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
109 tprintf(
" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
110 tprintf(
"Reads box or plain text files to extract the unicharset.\n");
113 return tesseract::Main(argc, argv);
#define INT_PARAM_FLAG(name, val, comment)
#define STRING_PARAM_FLAG(name, val, comment)
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
void tprintf(const char *format,...)
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
std::string ReadFile(const std::string &filename, FileReader reader)
const std::vector< std::string > split(const std::string &s, char c)
bool IsUTF8Whitespace(const char *text)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
bool save_to_file(const char *const filename) const