5#include <unordered_map>
9#include "unicode/uchar.h"
10#include "unicode/uscript.h"
41 const std::vector<char32> &src,
42 std::vector<std::vector<char32>> *dest) {
44 std::vector<std::vector<char32>> graphemes;
60 std::unique_ptr<Validator> validator(
ScriptValidator(script, report_errors));
61 for (
const auto &grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
73#define CASE(e, T) case ViramaScript::e: return std::make_unique<T>(script, report_errors)
80 return std::make_unique<ValidateIndic>(script, report_errors);
90 const std::vector<char32> &src,
91 std::vector<std::vector<char32>> *dest) {
110 dest->reserve(dest->size() +
output_.size());
112 dest->push_back({ch});
116 std::move(
parts_.begin(),
parts_.end(), std::back_inserter(*dest));
119 dest->push_back(std::vector<char32>());
124 dest->back().insert(dest->back().end(),
output_.begin(),
output_.end());
128static bool CmpPairSecond(
const std::pair<int, int> &p1,
const std::pair<int, int> &p2) {
129 return p1.second < p2.second;
136 std::unordered_map<int, int> histogram;
143 UScriptCode script_code = uscript_getScript(ch, err);
145 script_code == USCRIPT_MYANMAR) {
146 if (script_code == USCRIPT_MYANMAR) {
152 if (!histogram.empty()) {
153 int base = std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)->first;
171 (unicode & 0x7f) == 0x4d) ||
179 return (0x1cd0 <= unicode && unicode < 0x1d00) || (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
180 (0x951 <= unicode && unicode <= 0x954);
191 codes_.reserve(text.size());
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
static const char32 kSinhalaVirama
static const char32 kZeroWidthNonJoiner
static const char32 kKhmerVirama
virtual CharClass UnicodeToCharClass(char32 ch) const =0
static const char32 kJavaneseVirama
std::vector< char32 > output_
static const char32 kInvalid
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
static const char32 kRightToLeftMark
static bool IsVedicAccent(char32 unicode)
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
static const char32 kLeftToRightMark
static bool IsVirama(char32 unicode)
static const int kIndicCodePageSize
static const char32 kZeroWidthSpace
bool IsSubscriptScript() const
std::vector< IndicPair > codes_
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
void ComputeClassCodes(const std::vector< char32 > &text)
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
static const char32 kZeroWidthJoiner
static const char32 kMaxJavaneseUnicode
static const char32 kMaxSinhalaUnicode
static const char32 kMyanmarVirama
static const char32 kMinIndicUnicode
std::vector< std::vector< char32 > > parts_
virtual bool ConsumeGraphemeIfValid()=0