18#define _USE_MATH_DEFINES
1230 {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};
1236#define MINVARIANCE 0.0004
1244#define MINSAMPLESPERBUCKET 5
1245#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
1246#define MINSAMPLESNEEDED 1
1254#define BUCKETTABLESIZE 1024
1255#define NORMALEXTENT 3.0
1314#define Odd(N) ((N) % 2)
1315#define Mirror(N, R) ((R) - (N)-1)
1316#define Abs(N) (((N) < 0) ? (-(N)) : (N))
1326#define SqrtOf2Pi 2.506628275
1328static const double kNormalVariance =
1335#define LOOKUPTABLESIZE 8
1336#define MAXDEGREESOFFREEDOM MAXBUCKETS
1347static void CreateClusterTree(CLUSTERER *Clusterer);
1349static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);
1351static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster,
float *Distance);
1353static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
1355static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config);
1357static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster);
1359static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
1362static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster,
1363 STATISTICS *Statistics);
1365static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1368static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
1369 STATISTICS *Statistics, BUCKETS *Buckets);
1371static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1372 BUCKETS *NormalBuckets,
double Confidence);
1374static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
1376static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics);
1378static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);
1380static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1382static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1384static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1386static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);
1388static bool Independent(PARAM_DESC *ParamDesc, int16_t N,
float *CoVariance,
float Independence);
1390static BUCKETS *GetBuckets(CLUSTERER *clusterer,
DISTRIBUTION Distribution, uint32_t SampleCount,
1393static BUCKETS *MakeBuckets(
DISTRIBUTION Distribution, uint32_t SampleCount,
double Confidence);
1395static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);
1397static double ComputeChiSquared(uint16_t DegreesOfFreedom,
double Alpha);
1399static double NormalDensity(int32_t x);
1401static double UniformDensity(int32_t x);
1403static double Integral(
double f1,
double f2,
double Dx);
1405static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
1406 float Mean,
float StdDev);
1408static uint16_t NormalBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev);
1410static uint16_t UniformBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev);
1412static bool DistributionOK(BUCKETS *Buckets);
1414static uint16_t DegreesOfFreedom(
DISTRIBUTION Distribution, uint16_t HistogramBuckets);
1416static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);
1418static void InitBuckets(BUCKETS *Buckets);
1420static int AlphaMatch(
void *arg1,
1423static double Solve(
SOLVEFUNC Function,
void *FunctionParams,
double InitialGuess,
double Accuracy);
1425static double ChiArea(CHISTRUCT *ChiParams,
double x);
1427static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster,
float MaxIllegal);
1429static double InvertMatrix(
const float *input,
int size,
float *inv);
1446 Clusterer->NumberOfSamples = 0;
1447 Clusterer->NumChar = 0;
1450 Clusterer->Root =
nullptr;
1454 Clusterer->ParamDesc =
new PARAM_DESC[SampleSize];
1455 for (i = 0; i < SampleSize; i++) {
1457 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].
NonEssential;
1458 Clusterer->ParamDesc[i].Min = ParamDesc[i].
Min;
1459 Clusterer->ParamDesc[i].Max = ParamDesc[i].
Max;
1460 Clusterer->ParamDesc[i].Range = ParamDesc[i].
Max - ParamDesc[i].
Min;
1461 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
1462 Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].
Max + ParamDesc[i].
Min) / 2;
1466 Clusterer->KDTree =
MakeKDTree(SampleSize, ParamDesc);
1469 for (
auto &d : Clusterer->bucket_cache) {
1500 Sample->Clustered =
false;
1501 Sample->Prototype =
false;
1502 Sample->SampleCount = 1;
1503 Sample->Left =
nullptr;
1504 Sample->Right =
nullptr;
1505 Sample->CharID = CharID;
1507 for (i = 0; i < Clusterer->
SampleSize; i++) {
1508 Sample->Mean[i] = Feature[i];
1514 if (CharID >= Clusterer->
NumChar) {
1515 Clusterer->
NumChar = CharID + 1;
1545 if (Clusterer->
Root ==
nullptr) {
1546 CreateClusterTree(Clusterer);
1554 ComputePrototypes(Clusterer,
Config);
1576 if (Clusterer !=
nullptr) {
1578 delete Clusterer->
KDTree;
1579 delete Clusterer->
Root;
1609 auto *Prototype =
static_cast<PROTOTYPE *
>(arg);
1612 if (Prototype->Cluster !=
nullptr) {
1618 delete[] Prototype->Variance.Elliptical;
1619 delete[] Prototype->Magnitude.Elliptical;
1620 delete[] Prototype->Weight.Elliptical;
1644 Cluster =
reinterpret_cast<CLUSTER *
>((*SearchState)->first_node());
1645 *SearchState =
pop(*SearchState);
1647 if (
Cluster->Left ==
nullptr) {
1650 *SearchState =
push(*SearchState,
Cluster->Right);
1663 return (Proto->
Mean[Dimension]);
1674 switch (Proto->
Style) {
1680 switch (Proto->
Distrib[Dimension]) {
1709static void CreateClusterTree(CLUSTERER *Clusterer) {
1710 ClusteringContext context;
1715 context.tree = Clusterer->KDTree;
1716 context.candidates =
new TEMPCLUSTER[Clusterer->NumberOfSamples];
1718 context.heap =
new ClusterHeap(Clusterer->NumberOfSamples);
1719 KDWalk(context.tree,
reinterpret_cast<void_proc>(MakePotentialClusters), &context);
1722 while (context.heap->Pop(&HeapEntry)) {
1723 TEMPCLUSTER *PotentialCluster = HeapEntry.data();
1727 if (PotentialCluster->Cluster->Clustered) {
1733 else if (PotentialCluster->Neighbor->Clustered) {
1734 PotentialCluster->Neighbor =
1735 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
1736 if (PotentialCluster->Neighbor !=
nullptr) {
1737 context.heap->Push(&HeapEntry);
1743 PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
1744 PotentialCluster->Neighbor =
1745 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
1746 if (PotentialCluster->Neighbor !=
nullptr) {
1747 context.heap->Push(&HeapEntry);
1753 Clusterer->Root =
static_cast<CLUSTER *
> RootOf(Clusterer->KDTree);
1756 delete context.tree;
1757 Clusterer->KDTree =
nullptr;
1758 delete context.heap;
1759 delete[] context.candidates;
1771static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t ) {
1773 int next = context->next;
1774 context->candidates[next].Cluster = Cluster;
1775 HeapEntry.data() = &(context->candidates[next]);
1776 context->candidates[next].Neighbor =
1777 FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key());
1778 if (context->candidates[next].Neighbor !=
nullptr) {
1779 context->heap->Push(&HeapEntry);
1797static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster,
float *Distance)
1798#define MAXNEIGHBORS 2
1799#define MAXDISTANCE FLT_MAX
1803 int NumberOfNeighbors;
1805 CLUSTER *BestNeighbor;
1809 reinterpret_cast<void **
>(Neighbor), Dist);
1813 BestNeighbor =
nullptr;
1814 for (i = 0; i < NumberOfNeighbors; i++) {
1815 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
1816 *Distance = Dist[i];
1817 BestNeighbor = Neighbor[i];
1820 return BestNeighbor;
1832static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
1834 auto Cluster =
new CLUSTER(Clusterer->SampleSize);
1835 Cluster->Clustered =
false;
1836 Cluster->Prototype =
false;
1837 Cluster->Left = TempCluster->Cluster;
1838 Cluster->Right = TempCluster->Neighbor;
1839 Cluster->CharID = -1;
1842 Cluster->Left->Clustered =
true;
1843 Cluster->Right->Clustered =
true;
1844 KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);
1845 KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);
1848 Cluster->SampleCount =
MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
1849 Cluster->Left->SampleCount, Cluster->Right->SampleCount,
1850 &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]);
1853 KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);
1871 float m1[],
float m2[]) {
1875 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
1880 if ((*m2 - *m1) > ParamDesc->
HalfRange) {
1881 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->
Range)) / n;
1882 if (*m < ParamDesc->Min) {
1883 *m += ParamDesc->
Range;
1885 }
else if ((*m1 - *m2) > ParamDesc->
HalfRange) {
1886 *m = (n1 * (*m1 - ParamDesc->
Range) + n2 * *m2) / n;
1887 if (*m < ParamDesc->Min) {
1888 *m += ParamDesc->
Range;
1891 *m = (n1 * *m1 + n2 * *m2) / n;
1894 *m = (n1 * *m1 + n2 * *m2) / n;
1908static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config) {
1911 PROTOTYPE *Prototype;
1915 if (Clusterer->Root !=
nullptr) {
1924 Cluster =
reinterpret_cast<CLUSTER *
>(ClusterStack->first_node());
1925 ClusterStack =
pop(ClusterStack);
1926 Prototype = MakePrototype(Clusterer,
Config, Cluster);
1927 if (Prototype !=
nullptr) {
1928 Clusterer->ProtoList =
push(Clusterer->ProtoList, Prototype);
1930 ClusterStack =
push(ClusterStack, Cluster->Right);
1931 ClusterStack =
push(ClusterStack, Cluster->Left);
1951static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster) {
1961 auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
1966 Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics,
Config->
ProtoStyle,
1968 if (Proto !=
nullptr) {
1973 if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
1980 Proto = TestEllipticalProto(Clusterer,
Config, Cluster, Statistics);
1981 if (Proto !=
nullptr) {
1993 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1996 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1999 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
Config->
Confidence);
2002 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
2003 if (Proto !=
nullptr) {
2006 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
2007 if (Proto !=
nullptr) {
2010 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
Config->
Confidence);
2036static PROTOTYPE *MakeDegenerateProto(
2037 uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
PROTOSTYLE Style, int32_t MinSamples) {
2038 PROTOTYPE *Proto =
nullptr;
2044 if (Cluster->SampleCount < MinSamples) {
2047 Proto = NewSphericalProto(N, Cluster, Statistics);
2051 Proto = NewEllipticalProto(N, Cluster, Statistics);
2054 Proto = NewMixedProto(N, Cluster, Statistics);
2057 Proto->Significant =
false;
2075static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster,
2076 STATISTICS *Statistics) {
2082 const double kMagicSampleMargin = 0.0625;
2083 const double kFTableBoostMargin = 2.0;
2085 int N = Clusterer->SampleSize;
2086 CLUSTER *Left = Cluster->Left;
2087 CLUSTER *Right = Cluster->Right;
2088 if (Left ==
nullptr || Right ==
nullptr) {
2091 int TotalDims = Left->SampleCount + Right->SampleCount;
2092 if (TotalDims < N + 1 || TotalDims < 2) {
2095 std::vector<float> Covariance(
static_cast<size_t>(N) * N);
2096 std::vector<float> Inverse(
static_cast<size_t>(N) * N);
2097 std::vector<float> Delta(N);
2099 for (
int i = 0; i < N; ++i) {
2100 int row_offset = i * N;
2101 if (!Clusterer->ParamDesc[i].NonEssential) {
2102 for (
int j = 0; j < N; ++j) {
2103 if (!Clusterer->ParamDesc[j].NonEssential) {
2104 Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
2106 Covariance[j + row_offset] = 0.0f;
2110 for (
int j = 0; j < N; ++j) {
2112 Covariance[j + row_offset] = 1.0f;
2114 Covariance[j + row_offset] = 0.0f;
2119 double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
2121 tprintf(
"Clustering error: Matrix inverse failed with error %g\n", err);
2124 for (
int dim = 0; dim < N; ++dim) {
2125 if (!Clusterer->ParamDesc[dim].NonEssential) {
2126 Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
2134 for (
int x = 0; x < N; ++x) {
2136 for (
int y = 0; y < N; ++y) {
2137 temp +=
static_cast<double>(Inverse[y + N * x]) * Delta[y];
2139 Tsq += Delta[x] * temp;
2145 double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
2146 int Fx = EssentialN;
2151 int Fy = TotalDims - EssentialN - 1;
2156 double FTarget =
FTable[Fy][Fx];
2160 FTarget += kFTableBoostMargin;
2163 return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
2179static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2181 PROTOTYPE *Proto =
nullptr;
2185 for (i = 0; i < Clusterer->SampleSize; i++) {
2186 if (Clusterer->ParamDesc[i].NonEssential) {
2190 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2191 sqrt(
static_cast<double>(Statistics->AvgVariance)));
2192 if (!DistributionOK(Buckets)) {
2197 if (i >= Clusterer->SampleSize) {
2198 Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);
2214static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
2215 STATISTICS *Statistics, BUCKETS *Buckets) {
2216 PROTOTYPE *Proto =
nullptr;
2220 for (i = 0; i < Clusterer->SampleSize; i++) {
2221 if (Clusterer->ParamDesc[i].NonEssential) {
2225 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2226 sqrt(
static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)])));
2227 if (!DistributionOK(Buckets)) {
2232 if (i >= Clusterer->SampleSize) {
2233 Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
2253static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2254 BUCKETS *NormalBuckets,
double Confidence) {
2257 BUCKETS *UniformBuckets =
nullptr;
2258 BUCKETS *RandomBuckets =
nullptr;
2261 Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);
2264 for (i = 0; i < Clusterer->SampleSize; i++) {
2265 if (Clusterer->ParamDesc[i].NonEssential) {
2269 FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
2270 std::sqrt(Proto->Variance.Elliptical[i]));
2271 if (DistributionOK(NormalBuckets)) {
2275 if (RandomBuckets ==
nullptr) {
2276 RandomBuckets = GetBuckets(Clusterer,
D_random, Cluster->SampleCount, Confidence);
2278 MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i]));
2279 FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
2280 Proto->Variance.Elliptical[i]);
2281 if (DistributionOK(RandomBuckets)) {
2285 if (UniformBuckets ==
nullptr) {
2286 UniformBuckets = GetBuckets(Clusterer,
uniform, Cluster->SampleCount, Confidence);
2288 MakeDimUniform(i, Proto, Statistics);
2289 FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
2290 Proto->Variance.Elliptical[i]);
2291 if (DistributionOK(UniformBuckets)) {
2297 if (i < Clusterer->SampleSize) {
2311static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
2313 Proto->Mean[i] = ParamDesc->MidRange;
2314 Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
2317 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2318 Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
2319 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2320 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2332static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) {
2334 Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2;
2335 Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2;
2336 if (Proto->Variance.Elliptical[i] <
MINVARIANCE) {
2341 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2342 Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
2343 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2344 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2363static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
2367 uint32_t SampleCountAdjustedForBias;
2370 auto Statistics =
new STATISTICS(N);
2373 std::vector<float> Distance(N);
2377 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
2378 for (i = 0; i < N; i++) {
2379 Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
2380 if (ParamDesc[i].Circular) {
2381 if (Distance[i] > ParamDesc[i].HalfRange) {
2382 Distance[i] -= ParamDesc[i].Range;
2384 if (Distance[i] < -ParamDesc[i].HalfRange) {
2385 Distance[i] += ParamDesc[i].Range;
2388 if (Distance[i] < Statistics->Min[i]) {
2389 Statistics->Min[i] = Distance[i];
2391 if (Distance[i] > Statistics->Max[i]) {
2392 Statistics->Max[i] = Distance[i];
2395 auto CoVariance = &Statistics->CoVariance[0];
2396 for (i = 0; i < N; i++) {
2397 for (j = 0; j < N; j++, CoVariance++) {
2398 *CoVariance += Distance[i] * Distance[j];
2406 if (Cluster->SampleCount > 1) {
2407 SampleCountAdjustedForBias = Cluster->SampleCount - 1;
2409 SampleCountAdjustedForBias = 1;
2411 auto CoVariance = &Statistics->CoVariance[0];
2412 for (i = 0; i < N; i++) {
2413 for (j = 0; j < N; j++, CoVariance++) {
2414 *CoVariance /= SampleCountAdjustedForBias;
2419 Statistics->AvgVariance *= *CoVariance;
2423 Statistics->AvgVariance =
2424 static_cast<float>(pow(
static_cast<double>(Statistics->AvgVariance), 1.0 / N));
2440static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2443 Proto = NewSimpleProto(N, Cluster);
2445 Proto->Variance.Spherical = Statistics->AvgVariance;
2450 Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
2451 Proto->TotalMagnitude =
static_cast<float>(
2452 pow(
static_cast<double>(Proto->Magnitude.Spherical),
static_cast<double>(N)));
2453 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
2454 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2469static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2473 Proto = NewSimpleProto(N, Cluster);
2474 Proto->Variance.Elliptical =
new float[N];
2475 Proto->Magnitude.Elliptical =
new float[N];
2476 Proto->Weight.Elliptical =
new float[N];
2478 auto CoVariance = &Statistics->CoVariance[0];
2479 Proto->TotalMagnitude = 1.0;
2480 for (i = 0; i < N; i++, CoVariance += N + 1) {
2481 Proto->Variance.Elliptical[i] = *CoVariance;
2482 if (Proto->Variance.Elliptical[i] <
MINVARIANCE) {
2486 Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
2487 Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
2488 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2490 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2508static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2509 auto Proto = NewEllipticalProto(N, Cluster, Statistics);
2510 Proto->Distrib.clear();
2511 Proto->Distrib.resize(N,
normal);
2512 Proto->Style =
mixed;
2524static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {
2525 auto Proto =
new PROTOTYPE;
2527 Proto->Mean = Cluster->Mean;
2528 Proto->Distrib.clear();
2529 Proto->Significant =
true;
2530 Proto->Merged =
false;
2532 Proto->NumSamples = Cluster->SampleCount;
2533 Proto->Cluster = Cluster;
2534 Proto->Cluster->Prototype =
true;
2556static bool Independent(PARAM_DESC *ParamDesc, int16_t N,
float *CoVariance,
float Independence) {
2560 float CorrelationCoeff;
2563 for (i = 0; i < N; i++, VARii += N + 1) {
2564 if (ParamDesc[i].NonEssential) {
2568 VARjj = VARii + N + 1;
2569 CoVariance = VARii + 1;
2570 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
2571 if (ParamDesc[j].NonEssential) {
2575 if ((*VARii == 0.0) || (*VARjj == 0.0)) {
2576 CorrelationCoeff = 0.0;
2578 CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj)));
2580 if (CorrelationCoeff > Independence) {
2603static BUCKETS *GetBuckets(CLUSTERER *clusterer,
DISTRIBUTION Distribution, uint32_t SampleCount,
2604 double Confidence) {
2606 uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
2607 BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets -
MINBUCKETS];
2610 if (Buckets ==
nullptr) {
2611 Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
2612 clusterer->bucket_cache[Distribution][NumberOfBuckets -
MINBUCKETS] = Buckets;
2615 if (SampleCount != Buckets->SampleCount) {
2616 AdjustBuckets(Buckets, SampleCount);
2618 if (Confidence != Buckets->Confidence) {
2619 Buckets->Confidence = Confidence;
2620 Buckets->ChiSquared =
2621 ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
2623 InitBuckets(Buckets);
2644static BUCKETS *MakeBuckets(
DISTRIBUTION Distribution, uint32_t SampleCount,
double Confidence) {
2645 const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
2647 double BucketProbability;
2648 double NextBucketBoundary;
2650 double ProbabilityDelta;
2651 double LastProbDensity;
2653 uint16_t CurrentBucket;
2657 auto Buckets =
new BUCKETS(OptimumNumberOfBuckets(SampleCount));
2658 Buckets->SampleCount = SampleCount;
2659 Buckets->Confidence = Confidence;
2662 Buckets->Distribution = Distribution;
2666 Buckets->ChiSquared =
2667 ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
2671 BucketProbability = 1.0 /
static_cast<double>(Buckets->NumberOfBuckets);
2674 CurrentBucket = Buckets->NumberOfBuckets / 2;
2675 if (
Odd(Buckets->NumberOfBuckets)) {
2676 NextBucketBoundary = BucketProbability / 2;
2678 NextBucketBoundary = BucketProbability;
2682 LastProbDensity = (*DensityFunction[
static_cast<int>(Distribution)])(
BUCKETTABLESIZE / 2);
2684 ProbDensity = (*DensityFunction[
static_cast<int>(Distribution)])(i + 1);
2685 ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
2686 Probability += ProbabilityDelta;
2687 if (Probability > NextBucketBoundary) {
2688 if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
2691 NextBucketBoundary += BucketProbability;
2693 Buckets->Bucket[i] = CurrentBucket;
2694 Buckets->ExpectedCount[CurrentBucket] +=
static_cast<float>(ProbabilityDelta * SampleCount);
2695 LastProbDensity = ProbDensity;
2698 Buckets->ExpectedCount[CurrentBucket] +=
static_cast<float>((0.5 - Probability) * SampleCount);
2702 Buckets->Bucket[i] =
Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
2706 for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {
2707 Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
2726static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
2730 if (SampleCount < kCountTable[0]) {
2731 return kBucketsTable[0];
2735 if (SampleCount <= kCountTable[Next]) {
2736 Slope =
static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
2737 static_cast<float>(kCountTable[Next] - kCountTable[Last]);
2739 static_cast<uint16_t
>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last])));
2742 return kBucketsTable[Last];
2761static double ComputeChiSquared(uint16_t DegreesOfFreedom,
double Alpha)
2762#define CHIACCURACY 0.01
2763#define MINALPHA (1e-200)
2770 if (
Odd(DegreesOfFreedom)) {
2777 CHISTRUCT SearchKey(0.0, Alpha);
2778 auto OldChiSquared =
reinterpret_cast<CHISTRUCT *
>(
2781 if (OldChiSquared ==
nullptr) {
2782 OldChiSquared =
new CHISTRUCT(DegreesOfFreedom, Alpha);
2783 OldChiSquared->ChiSquared =
2784 Solve(ChiArea, OldChiSquared,
static_cast<double>(DegreesOfFreedom),
CHIACCURACY);
2785 ChiWith[DegreesOfFreedom] =
push(ChiWith[DegreesOfFreedom], OldChiSquared);
2790 return (OldChiSquared->ChiSquared);
2807static double NormalDensity(int32_t x) {
2810 Distance = x - kNormalMean;
2811 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
2821static double UniformDensity(int32_t x) {
2825 return UniformDistributionDensity;
2839static double Integral(
double f1,
double f2,
double Dx) {
2840 return (f1 + f2) * Dx / 2.0;
2863static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
2864 float Mean,
float StdDev) {
2871 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2872 Buckets->Count[i] = 0;
2875 if (StdDev == 0.0) {
2884 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
2885 if (Sample->Mean[Dim] >
Mean) {
2886 BucketID = Buckets->NumberOfBuckets - 1;
2887 }
else if (Sample->Mean[Dim] <
Mean) {
2892 Buckets->Count[BucketID] += 1;
2894 if (i >= Buckets->NumberOfBuckets) {
2901 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
2902 switch (Buckets->Distribution) {
2904 BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim],
Mean, StdDev);
2908 BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim],
Mean, StdDev);
2913 Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2929static uint16_t NormalBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev) {
2933 if (ParamDesc->Circular) {
2934 if (x -
Mean > ParamDesc->HalfRange) {
2935 x -= ParamDesc->Range;
2936 }
else if (x - Mean < -ParamDesc->HalfRange) {
2937 x += ParamDesc->Range;
2941 X = ((x -
Mean) / StdDev) * kNormalStdDev + kNormalMean;
2948 return static_cast<uint16_t
>(floor(
static_cast<double>(X)));
2962static uint16_t UniformBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev) {
2966 if (ParamDesc->Circular) {
2967 if (x -
Mean > ParamDesc->HalfRange) {
2968 x -= ParamDesc->Range;
2969 }
else if (x - Mean < -ParamDesc->HalfRange) {
2970 x += ParamDesc->Range;
2981 return static_cast<uint16_t
>(floor(
static_cast<double>(X)));
2994static bool DistributionOK(BUCKETS *Buckets) {
2995 float FrequencyDifference;
2996 float TotalDifference;
3000 TotalDifference = 0.0;
3001 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3002 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
3003 TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i];
3007 if (TotalDifference > Buckets->ChiSquared) {
3026static uint16_t DegreesOfFreedom(
DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
3027 static uint8_t DegreeOffsets[] = {3, 3, 1};
3029 uint16_t AdjustedNumBuckets;
3031 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[
static_cast<int>(Distribution)];
3032 if (
Odd(AdjustedNumBuckets)) {
3033 AdjustedNumBuckets++;
3035 return (AdjustedNumBuckets);
3046static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {
3048 double AdjustFactor;
3051 ((
static_cast<double>(NewSampleCount)) / (
static_cast<double>(Buckets->SampleCount)));
3053 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3054 Buckets->ExpectedCount[i] *= AdjustFactor;
3057 Buckets->SampleCount = NewSampleCount;
3066static void InitBuckets(BUCKETS *Buckets) {
3069 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3070 Buckets->Count[i] = 0;
3087static int AlphaMatch(
void *arg1,
3089 auto *ChiStruct =
static_cast<CHISTRUCT *
>(arg1);
3090 auto *SearchKey =
static_cast<CHISTRUCT *
>(arg2);
3092 return (ChiStruct->Alpha == SearchKey->Alpha);
3109static double Solve(
SOLVEFUNC Function,
void *FunctionParams,
double InitialGuess,
double Accuracy)
3110#define INITIALDELTA 0.1
3111#define DELTARATIO 0.1
3119 double LastPosX, LastNegX;
3124 LastNegX = -FLT_MAX;
3125 f = (*Function)(
static_cast<CHISTRUCT *
>(FunctionParams), x);
3126 while (
Abs(LastPosX - LastNegX) > Accuracy) {
3135 Slope = ((*Function)(
static_cast<CHISTRUCT *
>(FunctionParams), x + Delta) - f) / Delta;
3144 if (NewDelta < Delta) {
3149 f = (*Function)(
static_cast<CHISTRUCT *
>(FunctionParams), x);
3173static double ChiArea(CHISTRUCT *ChiParams,
double x) {
3179 N = ChiParams->DegreesOfFreedom / 2 - 1;
3183 for (i = 1; i <= N; i++) {
3184 Denominator *= 2 * i;
3186 SeriesTotal += PowerOfx / Denominator;
3188 return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha);
3215static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster,
float MaxIllegal)
3216#define ILLEGAL_CHAR 2
3218 static std::vector<uint8_t> CharFlags;
3222 int32_t NumCharInCluster;
3223 int32_t NumIllegalInCluster;
3224 float PercentIllegal;
3228 NumIllegalInCluster = 0;
3230 if (Clusterer->NumChar > CharFlags.size()) {
3231 CharFlags.resize(Clusterer->NumChar);
3234 for (
auto &CharFlag : CharFlags) {
3240 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
3241 CharID = Sample->CharID;
3242 if (CharFlags[CharID] ==
false) {
3243 CharFlags[CharID] =
true;
3245 if (CharFlags[CharID] ==
true) {
3246 NumIllegalInCluster++;
3250 PercentIllegal =
static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
3251 if (PercentIllegal > MaxIllegal) {
3266static double InvertMatrix(
const float *input,
int size,
float *inv) {
3268 GENERIC_2D_ARRAY<double> U(size, size, 0.0);
3269 GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
3270 GENERIC_2D_ARRAY<double> L(size, size, 0.0);
3275 for (row = 0; row < size; row++) {
3276 for (col = 0; col < size; col++) {
3277 U[row][col] = input[row * size + col];
3278 L[row][col] = row == col ? 1.0 : 0.0;
3279 U_inv[row][col] = 0.0;
3284 for (col = 0; col < size; ++col) {
3287 double best_pivot = -1.0;
3288 for (row = col; row < size; ++row) {
3289 if (
Abs(U[row][col]) > best_pivot) {
3290 best_pivot =
Abs(U[row][col]);
3295 if (best_row != col) {
3296 for (
int k = 0; k < size; ++k) {
3297 double tmp = U[best_row][k];
3298 U[best_row][k] = U[col][k];
3300 tmp = L[best_row][k];
3301 L[best_row][k] = L[col][k];
3306 for (row = col + 1; row < size; ++row) {
3307 double ratio = -U[row][col] / U[col][col];
3308 for (
int j = col; j < size; ++j) {
3309 U[row][j] += U[col][j] * ratio;
3311 for (
int k = 0; k < size; ++k) {
3312 L[row][k] += L[col][k] * ratio;
3317 for (col = 0; col < size; ++col) {
3318 U_inv[col][col] = 1.0 / U[col][col];
3319 for (row = col - 1; row >= 0; --row) {
3321 for (
int k = col; k > row; --k) {
3322 total += U[row][k] * U_inv[k][col];
3324 U_inv[row][col] = -total / U[row][row];
3328 for (row = 0; row < size; row++) {
3329 for (col = 0; col < size; col++) {
3331 for (
int k = row; k < size; ++k) {
3332 sum += U_inv[row][k] * L[k][col];
3334 inv[row * size + col] = sum;
3338 double error_sum = 0.0;
3339 for (row = 0; row < size; row++) {
3340 for (col = 0; col < size; col++) {
3342 for (
int k = 0; k < size; ++k) {
3343 sum +=
static_cast<double>(input[row * size + k]) * inv[k * size + col];
3346 error_sum +=
Abs(sum);
#define InitSampleSearch(S, C)
#define MAXDEGREESOFFREEDOM
void KDDelete(KDTREE *Tree, float Key[], void *Data)
void KDStore(KDTREE *Tree, float *Key, void *Data)
void tprintf(const char *format,...)
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
tesseract::GenericHeap< ClusterPair > ClusterHeap
CLUSTER * NextSample(LIST *SearchState)
double(*)(int32_t) DENSITYFUNC
tesseract::KDPairInc< float, TEMPCLUSTER * > ClusterPair
KDTREE * MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[])
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
double(*)(CHISTRUCT *, double) SOLVEFUNC
const double FTable[FTABLE_Y][FTABLE_X]
void KDWalk(KDTREE *Tree, void_proc action, void *context)
void FreeProtoList(LIST *ProtoList)
void FreePrototype(void *arg)
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension)
void FreeClusterer(CLUSTERER *Clusterer)
LIST search(LIST list, void *key, int_compare is_equal)
void KDNearestNeighborSearch(KDTREE *Tree, float Query[], int QuerySize, float MaxDistance, int *NumberOfResults, void **NBuffer, float DBuffer[])
LIST push(LIST list, void *element)
void destroy_nodes(LIST list, void_dest destructor)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
std::vector< float > CoVariance
uint16_t Bucket[BUCKETTABLESIZE]
DISTRIBUTION Distribution
std::vector< uint32_t > Count
std::vector< float > ExpectedCount
uint16_t DegreesOfFreedom
CHISTRUCT(uint16_t degreesOfFreedom, double alpha)
std::vector< float > Mean
std::vector< DISTRIBUTION > Distrib
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]