PostGIS 3.0.6dev-r@@SVN_REVISION@@
Loading...
Searching...
No Matches
lwkmeans.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * Copyright (c) 2018, Darafei Praliaskouski <me@komzpa.net>
4 * Copyright (c) 2016, Paul Ramsey <pramsey@cleverelephant.ca>
5 *
6 *------------------------------------------------------------------------*/
7
9
10/*
11 * When clustering lists with NULL or EMPTY elements, they will get this as
12 * their cluster number. (All the other clusters will be non-negative)
13 */
14#define KMEANS_NULL_CLUSTER -1
15
16/*
17 * If the algorithm doesn't converge within this number of iterations,
18 * it will return with a failure error code.
19 */
20#define KMEANS_MAX_ITERATIONS 1000
21
22static void
23update_r(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t k)
24{
25 POINT2D* obj;
26 unsigned int i;
27 double distance, curr_distance;
28 uint32_t cluster, curr_cluster;
29
30 for (i = 0; i < n; i++)
31 {
32 obj = objs[i];
33
34 /* Don't try to cluster NULL objects, just add them to the "unclusterable cluster" */
35 if (!obj)
36 {
37 clusters[i] = KMEANS_NULL_CLUSTER;
38 continue;
39 }
40
41 /* Initialize with distance to first cluster */
42 curr_distance = distance2d_sqr_pt_pt(obj, centers[0]);
43 curr_cluster = 0;
44
45 /* Check all other cluster centers and find the nearest */
46 for (cluster = 1; cluster < k; cluster++)
47 {
48 distance = distance2d_sqr_pt_pt(obj, centers[cluster]);
49 if (distance < curr_distance)
50 {
51 curr_distance = distance;
52 curr_cluster = cluster;
53 }
54 }
55
56 /* Store the nearest cluster this object is in */
57 clusters[i] = (int) curr_cluster;
58 }
59}
60
61static void
62update_means(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t* weights, uint32_t k)
63{
64 uint32_t i;
65 int cluster;
66
67 memset(weights, 0, sizeof(uint32_t) * k);
68 for (i = 0; i < k; i++)
69 {
70 centers[i]->x = 0.0;
71 centers[i]->y = 0.0;
72 }
73 for (i = 0; i < n; i++)
74 {
75 cluster = clusters[i];
76 if (cluster != KMEANS_NULL_CLUSTER)
77 {
78 centers[cluster]->x += objs[i]->x;
79 centers[cluster]->y += objs[i]->y;
80 weights[cluster] += 1;
81 }
82 }
83 for (i = 0; i < k; i++)
84 {
85 if (weights[i])
86 {
87 centers[i]->x /= weights[i];
88 centers[i]->y /= weights[i];
89 }
90 }
91}
92
93static int
94kmeans(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t k)
95{
96 uint32_t i = 0;
97 int* clusters_last;
98 int converged = LW_FALSE;
99 size_t clusters_sz = sizeof(int) * n;
100 uint32_t* weights;
101
102 weights = lwalloc(sizeof(int) * k);
103
104 /* previous cluster state array */
105 clusters_last = lwalloc(clusters_sz);
106
107 for (i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++)
108 {
109 LW_ON_INTERRUPT(break);
110
111 /* store the previous state of the clustering */
112 memcpy(clusters_last, clusters, clusters_sz);
113
114 update_r(objs, clusters, n, centers, k);
115 update_means(objs, clusters, n, centers, weights, k);
116
117 /* if all the cluster numbers are unchanged, we are at a stable solution */
118 converged = memcmp(clusters_last, clusters, clusters_sz) == 0;
119 }
120
121 lwfree(clusters_last);
122 lwfree(weights);
123 if (!converged)
124 lwerror("%s did not converge after %d iterations", __func__, i);
125 return converged;
126}
127
128static void
129kmeans_init(POINT2D **objs, uint32_t n, POINT2D **centers, POINT2D *centers_raw, uint32_t k)
130{
131 double* distances;
132 uint32_t p1 = 0, p2 = 0;
133 uint32_t i, j;
134 uint32_t duplicate_count = 1; /* a point is a duplicate of itself */
135 double max_dst = -1, current_distance;
136 double dst_p1, dst_p2;
137
138 /* k=0, k=1: "clustering" is just input validation */
139 assert(k > 1);
140
141 /* k >= 2: find two distant points greedily */
142 for (i = 1; i < n; i++)
143 {
144 /* skip null */
145 if (!objs[i]) continue;
146
147 /* reinit if first element happened to be null */
148 if (!objs[p1] && !objs[p2])
149 {
150 p1 = i;
151 p2 = i;
152 continue;
153 }
154
155 /* if we found a larger distance, replace our choice */
156 dst_p1 = distance2d_sqr_pt_pt(objs[i], objs[p1]);
157 dst_p2 = distance2d_sqr_pt_pt(objs[i], objs[p2]);
158 if ((dst_p1 > max_dst) || (dst_p2 > max_dst))
159 {
160 if (dst_p1 > dst_p2)
161 {
162 max_dst = dst_p1;
163 p2 = i;
164 }
165 else
166 {
167 max_dst = dst_p2;
168 p1 = i;
169 }
170 }
171 if ((dst_p1 == 0) || (dst_p2 == 0)) duplicate_count++;
172 }
173 if (duplicate_count > 1)
174 lwnotice(
175 "%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested",
176 __func__,
177 duplicate_count);
178
179 /* by now two points should be found and non-same */
180 assert(p1 != p2 && objs[p1] && objs[p2] && max_dst >= 0);
181
182 /* accept these two points */
183 centers_raw[0] = *((POINT2D *)objs[p1]);
184 centers[0] = &(centers_raw[0]);
185 centers_raw[1] = *((POINT2D *)objs[p2]);
186 centers[1] = &(centers_raw[1]);
187
188 if (k > 2)
189 {
190 /* array of minimum distance to a point from accepted cluster centers */
191 distances = lwalloc(sizeof(double) * n);
192
193 /* initialize array with distance to first object */
194 for (j = 0; j < n; j++)
195 {
196 if (objs[j])
197 distances[j] = distance2d_sqr_pt_pt(objs[j], centers[0]);
198 else
199 distances[j] = -1;
200 }
201 distances[p1] = -1;
202 distances[p2] = -1;
203
204 /* loop i on clusters, skip 0 and 1 as found already */
205 for (i = 2; i < k; i++)
206 {
207 uint32_t candidate_center = 0;
208 double max_distance = -DBL_MAX;
209
210 /* loop j on objs */
211 for (j = 0; j < n; j++)
212 {
213 /* empty objs and accepted clusters are already marked with distance = -1 */
214 if (distances[j] < 0) continue;
215
216 /* update minimal distance with previosuly accepted cluster */
217 current_distance = distance2d_sqr_pt_pt(objs[j], centers[i - 1]);
218 if (current_distance < distances[j])
219 distances[j] = current_distance;
220
221 /* greedily take a point that's farthest from any of accepted clusters */
222 if (distances[j] > max_distance)
223 {
224 candidate_center = j;
225 max_distance = distances[j];
226 }
227 }
228
229 /* Checked earlier by counting entries on input, just in case */
230 assert(max_distance >= 0);
231
232 /* accept candidate to centers */
233 distances[candidate_center] = -1;
234 /* Copy the point coordinates into the initial centers array
235 * Centers array is an array of pointers to points, not an array of points */
236 centers_raw[i] = *((POINT2D *)objs[candidate_center]);
237 centers[i] = &(centers_raw[i]);
238 }
239 lwfree(distances);
240 }
241}
242
243int*
244lwgeom_cluster_2d_kmeans(const LWGEOM** geoms, uint32_t n, uint32_t k)
245{
246 uint32_t i;
247 uint32_t num_centroids = 0;
248 uint32_t num_non_empty = 0;
249 LWGEOM** centroids;
250 POINT2D* centers_raw;
251 const POINT2D* cp;
252 int result = LW_FALSE;
253
254 /* An array of objects to be analyzed.
255 * All NULL values will be returned in the KMEANS_NULL_CLUSTER. */
256 POINT2D** objs;
257
258 /* An array of centers for the algorithm. */
259 POINT2D** centers;
260
261 /* Array to fill in with cluster numbers. */
262 int* clusters;
263
264 assert(k > 0);
265 assert(n > 0);
266 assert(geoms);
267
268 if (n < k)
269 {
270 lwerror("%s: number of geometries is less than the number of clusters requested, not all clusters will get data", __func__);
271 k = n;
272 }
273
274 /* We'll hold the temporary centroid objects here */
275 centroids = lwalloc(sizeof(LWGEOM*) * n);
276 memset(centroids, 0, sizeof(LWGEOM*) * n);
277
278 /* The vector of cluster means. We have to allocate a chunk of memory for these because we'll be mutating them
279 * in the kmeans algorithm */
280 centers_raw = lwalloc(sizeof(POINT2D) * k);
281 memset(centers_raw, 0, sizeof(POINT2D) * k);
282
283 /* K-means configuration setup */
284 objs = lwalloc(sizeof(POINT2D*) * n);
285 clusters = lwalloc(sizeof(int) * n);
286 centers = lwalloc(sizeof(POINT2D*) * k);
287
288 /* Clean the memory */
289 memset(objs, 0, sizeof(POINT2D*) * n);
290 memset(clusters, 0, sizeof(int) * n);
291 memset(centers, 0, sizeof(POINT2D*) * k);
292
293 /* Prepare the list of object pointers for K-means */
294 for (i = 0; i < n; i++)
295 {
296 const LWGEOM* geom = geoms[i];
297 LWPOINT* lwpoint;
298
299 /* Null/empty geometries get a NULL pointer, set earlier with memset */
300 if ((!geom) || lwgeom_is_empty(geom)) continue;
301
302 /* If the input is a point, use its coordinates */
303 /* If its not a point, convert it to one via centroid */
304 if (lwgeom_get_type(geom) != POINTTYPE)
305 {
307 if ((!centroid)) continue;
309 {
311 continue;
312 }
313 centroids[num_centroids++] = centroid;
314 lwpoint = lwgeom_as_lwpoint(centroid);
315 }
316 else
317 lwpoint = lwgeom_as_lwpoint(geom);
318
319 /* Store a pointer to the POINT2D we are interested in */
320 cp = getPoint2d_cp(lwpoint->point, 0);
321 objs[i] = (POINT2D*)cp;
322 num_non_empty++;
323 }
324
325 if (num_non_empty < k)
326 {
327 lwnotice("%s: number of non-empty geometries is less than the number of clusters requested, not all clusters will get data", __func__);
328 k = num_non_empty;
329 }
330
331 if (k > 1)
332 {
333 kmeans_init(objs, n, centers, centers_raw, k);
334 result = kmeans(objs, clusters, n, centers, k);
335 }
336 else
337 {
338 /* k=0: everything is unclusterable
339 * k=1: mark up NULL and non-NULL */
340 for (i = 0; i < n; i++)
341 {
342 if (k == 0 || !objs[i])
343 clusters[i] = KMEANS_NULL_CLUSTER;
344 else
345 clusters[i] = 0;
346 }
347 result = LW_TRUE;
348 }
349
350 /* Before error handling, might as well clean up all the inputs */
351 lwfree(objs);
352 lwfree(centers);
353 lwfree(centers_raw);
354 lwfree(centroids);
355
356 /* Good result */
357 if (result) return clusters;
358
359 /* Bad result, not going to need the answer */
360 lwfree(clusters);
361 return NULL;
362}
#define LW_FALSE
Definition liblwgeom.h:108
void lwgeom_free(LWGEOM *geom)
Definition lwgeom.c:1138
LWGEOM * lwgeom_centroid(const LWGEOM *geom)
#define POINTTYPE
LWTYPE numbers, used internally by PostGIS.
Definition liblwgeom.h:116
void * lwalloc(size_t size)
Definition lwutil.c:227
void lwfree(void *mem)
Definition lwutil.c:242
#define LW_TRUE
Return types for functions with status returns.
Definition liblwgeom.h:107
#define LW_ON_INTERRUPT(x)
void lwerror(const char *fmt,...)
Write a notice out to the error handler.
Definition lwutil.c:190
void lwnotice(const char *fmt,...)
Write a notice out to the notice handler.
Definition lwutil.c:177
static double distance2d_sqr_pt_pt(const POINT2D *p1, const POINT2D *p2)
Definition lwinline.h:35
static uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition lwinline.h:135
static LWPOINT * lwgeom_as_lwpoint(const LWGEOM *lwgeom)
Definition lwinline.h:121
static int lwgeom_is_empty(const LWGEOM *geom)
Return true or false depending on whether a geometry is an "empty" geometry (no vertices members)
Definition lwinline.h:193
static const POINT2D * getPoint2d_cp(const POINTARRAY *pa, uint32_t n)
Returns a POINT2D pointer into the POINTARRAY serialized_ptlist, suitable for reading from.
Definition lwinline.h:91
#define KMEANS_MAX_ITERATIONS
Definition lwkmeans.c:20
int * lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k)
Take a list of LWGEOMs and a number of clusters and return an integer array indicating which cluster ...
Definition lwkmeans.c:244
static void update_r(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t k)
Definition lwkmeans.c:23
static void kmeans_init(POINT2D **objs, uint32_t n, POINT2D **centers, POINT2D *centers_raw, uint32_t k)
Definition lwkmeans.c:129
#define KMEANS_NULL_CLUSTER
Definition lwkmeans.c:14
static void update_means(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t *weights, uint32_t k)
Definition lwkmeans.c:62
static int kmeans(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t k)
Definition lwkmeans.c:94
static double distance(double x1, double y1, double x2, double y2)
Definition lwtree.c:1032
Datum centroid(PG_FUNCTION_ARGS)
POINTARRAY * point
Definition liblwgeom.h:457
double y
Definition liblwgeom.h:376
double x
Definition liblwgeom.h:376