Loading...
Searching...
No Matches
choose_n_farthest_points.h
1/* This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
2 * See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
3 * Author(s): Siargey Kachanovich, Marc Glisse
4 *
5 * Copyright (C) 2016 Inria
6 *
7 * Modification(s):
8 * - 2022/11 Glisse: New *_metric variant
9 * - YYYY/MM Author: Description of the modification
10 */
11
12#ifndef CHOOSE_N_FARTHEST_POINTS_H_
13#define CHOOSE_N_FARTHEST_POINTS_H_
14
15#include <boost/version.hpp>
16#include <boost/range.hpp>
17#include <boost/heap/d_ary_heap.hpp>
18#if BOOST_VERSION >= 108100
19 #include <boost/unordered/unordered_flat_set.hpp>
20#else
21 #include <boost/unordered_set.hpp> // preferably with boost 1.79+ for speed
22#endif
23
24#include <gudhi/Null_output_iterator.h>
25
26#include <iterator>
27#include <vector>
28#include <utility>
29#include <random>
30#include <limits>
31
32namespace Gudhi {
33
34namespace subsampling {
35
39enum : std::size_t {
43 random_starting_point = std::size_t(-1)
44};
45
71template < typename Distance,
72typename Point_range,
73typename PointOutputIterator,
74typename DistanceOutputIterator = Null_output_iterator>
75void choose_n_farthest_points(Distance dist,
76 Point_range const &input_pts,
77 std::size_t final_size,
78 std::size_t starting_point,
79 PointOutputIterator output_it,
80 DistanceOutputIterator dist_it = {}) {
81 std::size_t nb_points = boost::size(input_pts);
82 if (final_size > nb_points)
83 final_size = nb_points;
84
85 // Tests to the limit
86 if (final_size < 1)
87 return;
88
89 if (starting_point == random_starting_point) {
90 // Choose randomly the first landmark
91 std::random_device rd;
92 std::mt19937 gen(rd());
93 std::uniform_int_distribution<std::size_t> dis(0, nb_points - 1);
94 starting_point = dis(gen);
95 }
96
97 // FIXME: don't hard-code the type as double. For Epeck_d, we also want to handle types that do not have an infinity.
98 typedef double FT;
99 static_assert(std::numeric_limits<FT>::has_infinity, "the number type needs to support infinity()");
100
101 *output_it++ = input_pts[starting_point];
102 *dist_it++ = std::numeric_limits<FT>::infinity();
103 if (final_size == 1) return;
104
105 std::vector<std::size_t> points(nb_points); // map from remaining points to indexes in input_pts
106 std::vector< FT > dist_to_L(nb_points); // vector of current distances to L from points
107 for(std::size_t i = 0; i < nb_points; ++i) {
108 points[i] = i;
109 dist_to_L[i] = dist(input_pts[i], input_pts[starting_point]);
110 }
111 // The indirection through points makes the program a bit slower. Some alternatives:
112 // - the original code never removed points and counted on them not
113 // reappearing because of a self-distance of 0. This causes unnecessary
114 // computations when final_size is large. It also causes trouble if there are
115 // input points at distance 0 from each other.
116 // - copy input_pts and update the local copy when removing points.
117
118 std::size_t curr_max_w = starting_point;
119
120 for (std::size_t current_number_of_landmarks = 1; current_number_of_landmarks != final_size; current_number_of_landmarks++) {
121 std::size_t latest_landmark = points[curr_max_w];
122 // To remove the latest landmark at index curr_max_w, replace it
123 // with the last point and reduce the length of the vector.
124 std::size_t last = points.size() - 1;
125 if (curr_max_w != last) {
126 points[curr_max_w] = points[last];
127 dist_to_L[curr_max_w] = dist_to_L[last];
128 }
129 points.pop_back();
130
131 // Update distances to L.
132 std::size_t i = 0;
133 for (auto p : points) {
134 FT curr_dist = dist(input_pts[p], input_pts[latest_landmark]);
135 if (curr_dist < dist_to_L[i])
136 dist_to_L[i] = curr_dist;
137 ++i;
138 }
139 // choose the next landmark
140 curr_max_w = 0;
141 FT curr_max_dist = dist_to_L[curr_max_w]; // used for defining the furthest point from L
142 for (i = 1; i < points.size(); i++)
143 if (dist_to_L[i] > curr_max_dist) {
144 curr_max_dist = dist_to_L[i];
145 curr_max_w = i;
146 }
147 *output_it++ = input_pts[points[curr_max_w]];
148 *dist_it++ = dist_to_L[curr_max_w];
149 }
150}
151
152
153// How bad is it to use the triangle inequality with inexact double computations?
154// Hopefully we still get a net-tree.
155
156template<class FT> struct Landmark_info;
157template<class FT>
158struct Compare_landmark_radius {
159 std::vector<Landmark_info<FT>>* landmarks_p;
160 Compare_landmark_radius(std::vector<Landmark_info<FT>>* p): landmarks_p(p) {}
161 bool operator()(std::size_t, std::size_t) const;
162};
163// I compared all the heaps in boost. Fibonacci is not bad, but d_ary is by far the fastest.
164// Arity of 3 is clearly faster than 2, and speed doesn't change much when increasing the arity even more.
165template<class FT>
166using radius_priority_ds =
167 boost::heap::d_ary_heap<std::size_t, boost::heap::arity<7>, boost::heap::compare<Compare_landmark_radius<FT>>,
168 boost::heap::mutable_<true>, boost::heap::constant_time_size<false>>;
169template<class FT>
170struct Landmark_info {
171 std::size_t farthest; FT radius;
172 // The points that are closer to this landmark than to other landmarks
173 std::vector<std::pair<std::size_t, FT>> voronoi;
174 // For a landmark A, the list of landmarks B such that picking a Voronoi
175 // point of A as a new landmark might steal a Voronoi point from B, or vice versa.
176 std::vector<std::pair<std::size_t, FT>> neighbors;
177 // Note that above we cache the distances. This is always good for Voronoi, and for neighbors it is neutral in 2D and helps in 4D.
178 typename radius_priority_ds<FT>::handle_type position_in_queue;
179};
180template<class FT>
181bool Compare_landmark_radius<FT>::operator()(std::size_t a, std::size_t b)const{ return (*landmarks_p)[a].radius < (*landmarks_p)[b].radius; }
182
204template < typename Distance,
205typename Point_range,
206typename PointOutputIterator,
207typename DistanceOutputIterator = Null_output_iterator>
209 Point_range const &input_pts,
210 std::size_t final_size,
211 std::size_t starting_point,
212 PointOutputIterator output_it,
213 DistanceOutputIterator dist_it = {}) {
214 std::size_t nb_points = boost::size(input_pts);
215 if (final_size > nb_points)
216 final_size = nb_points;
217
218 // Tests to the limit
219 if (final_size < 1)
220 return;
221
222 if (starting_point == random_starting_point) {
223 // Choose randomly the first landmark
224 std::random_device rd;
225 std::mt19937 gen(rd());
226 std::uniform_int_distribution<std::size_t> dis(0, nb_points - 1);
227 starting_point = dis(gen);
228 }
229
230 // FIXME: don't hard-code the type as double. For Epeck_d, we also want to handle types that do not have an infinity.
231 typedef double FT;
232 static_assert(std::numeric_limits<FT>::has_infinity, "the number type needs to support infinity()");
233
234 *output_it++ = input_pts[starting_point];
235 *dist_it++ = std::numeric_limits<FT>::infinity();
236 if (final_size == 1) return;
237
238 auto dist = [&](std::size_t a, std::size_t b){ return dist_(input_pts[a], input_pts[b]); };
239
240 std::vector<Landmark_info<FT>> landmarks(nb_points);
241 radius_priority_ds<FT> radius_priority(&landmarks);
242
243 auto compute_radius = [&](std::size_t i)
244 {
245 FT r = -std::numeric_limits<FT>::infinity(); // -2 * diameter should suffice
246 std::size_t jmax = -1;
247 for(auto [ j, d ] : landmarks[i].voronoi) {
248 if (d > r) {
249 r = d;
250 jmax = j;
251 }
252 }
253 landmarks[i].radius = r;
254 landmarks[i].farthest = jmax;
255 };
256 auto update_radius = [&](std::size_t i)
257 {
258 compute_radius(i);
259 radius_priority.decrease(landmarks[i].position_in_queue);
260 };
261
262 {
263 // Initialize everything with starting_point
264 auto& ini = landmarks[starting_point];
265 ini.voronoi.reserve(nb_points - 1);
266 for (std::size_t i = 0; i < nb_points; ++i)
267 if (i != starting_point)
268 ini.voronoi.emplace_back(i, dist(starting_point, i));
269 compute_radius(starting_point);
270 ini.position_in_queue = radius_priority.push(starting_point);
271 }
272 // outside the loop to recycle the allocation
273 std::vector<std::size_t> modified_neighbors;
274#if BOOST_VERSION >= 108100
275 boost::unordered_flat_set<std::size_t>
276#else
277 boost::unordered_set<std::size_t>
278#endif
279 l_neighbors; // Should we use an allocator?
280 for (std::size_t current_number_of_landmarks = 1; current_number_of_landmarks != final_size; current_number_of_landmarks++) {
281 std::size_t l_parent = radius_priority.top();
282 auto& parent_info = landmarks[l_parent];
283 std::size_t l = parent_info.farthest;
284 FT radius = parent_info.radius;
285 auto& info = landmarks[l];
286 *output_it++ = input_pts[l];
287 *dist_it++ = radius;
288 l_neighbors.clear();
289 modified_neighbors.clear();
290 // If a Voronoi point X of A can steal a Voronoi point Y from B, then
291 // BY > XY >= AB - AX - BY, so AB < AX + 2 * BY. Symmetrized.
292 auto max_dist = [](FT a, FT b){ return a + b + std::max(a, b); }; // tighter than 3 * radius
293 // Check if any Voronoi points from ngb need to move to l
294 auto handle_neighbor_voronoi = [&](std::size_t ngb)
295 {
296 auto& ngb_info = landmarks[ngb];
297 auto it = std::remove_if(ngb_info.voronoi.begin(), ngb_info.voronoi.end(), [&](auto wd)
298 {
299 std::size_t w = wd.first;
300 FT d = wd.second;
301 FT newd = dist(l, w);
302 if (newd < d) {
303 if (w != l) // w==l can only happen for ngb==l_parent
304 info.voronoi.emplace_back(w, newd);
305 return true;
306 }
307 return false;
308 });
309 if (it != ngb_info.voronoi.end()) { // modified, always true for ngb==l_parent
310 ngb_info.voronoi.erase(it, ngb_info.voronoi.end());
311 modified_neighbors.push_back(ngb);
312 // We only need to recompute the radius if farthest was removed, which we can test here with
313 // if (dist(l, ngb_info.farthest) < ngb_info.radius)
314 // to avoid a costly test for each w in the loop above, but it does not seem to help.
315 update_radius(ngb);
316 // if (ngb_info.voronoi.empty()) radius_priority.erase(ngb_info.position_in_queue);
317 }
318 // We could easily return true/false here to say whether anything was modified, if useful.
319 };
320 auto handle_neighbor_neighbors = [&](std::size_t ngb)
321 {
322 auto& ngb_info = landmarks[ngb];
323 auto it = std::remove_if(ngb_info.neighbors.begin(), ngb_info.neighbors.end(), [&](auto near_){
324 std::size_t near = near_.first;
325 FT d = near_.second;
326 // Conservative 3 * radius: we could use the old radii of ngb and near, but not the new ones.
327 if (d <= 3 * radius) { l_neighbors.insert(near); }
328 // Here it is safe to use the new radii.
329 return d >= max_dist(ngb_info.radius, landmarks[near].radius);
330 });
331 ngb_info.neighbors.erase(it, ngb_info.neighbors.end());
332 };
333 // First update the Voronoi diagram, so we can compute all the updated
334 // radii before pruning neighbor lists. The main drawback is that we have
335 // to store modified_neighbors, and we don't have access to the old radii
336 // in handle_neighbor_neighbors.
337 handle_neighbor_voronoi(l_parent);
338 // Should we make this loop a remove_if? We already remove in the next loop.
339 for (auto ngb_ : parent_info.neighbors) {
340 std::size_t ngb = ngb_.first;
341 //if(ngb_.second <= max_dist(radius, landmarks[ngb].radius)) // radius from before update_radius(l_parent)
342 //if(ngb_.second <= radius + 2 * landmarks[ngb].radius) // no need to symmetrize
343 // If X can steal a Voronoi point Y from B, then BY > XY >= BX - BY, so BX < 2 * BY.
344 if(dist(l, ngb) < 2 * landmarks[ngb].radius)
345 handle_neighbor_voronoi(ngb);
346 }
347 // If there are too many neighbors (of neighbors), this could be quadratic (?).
348 // Testing every landmark would then be faster, linear.
349 // TODO: find a good heuristic to switch to the linear case.
350 for (std::size_t ngb : modified_neighbors)
351 handle_neighbor_neighbors(ngb);
352 // Finalize the new Voronoi cell.
353 compute_radius(l);
354 info.position_in_queue = radius_priority.push(l);
355 // The parent is an obvious candidate to be a neighbor.
356 l_neighbors.insert(l_parent);
357 for (std::size_t ngb : l_neighbors) {
358 FT d = dist(l, ngb);
359 if (d < max_dist(info.radius, landmarks[ngb].radius)) {
360 info.neighbors.emplace_back(ngb, d);
361 landmarks[ngb].neighbors.emplace_back(l, d);
362 }
363 }
364 }
365}
366
367} // namespace subsampling
368
369} // namespace Gudhi
370
371#endif // CHOOSE_N_FARTHEST_POINTS_H_
void choose_n_farthest_points(Distance dist, Point_range const &input_pts, std::size_t final_size, std::size_t starting_point, PointOutputIterator output_it, DistanceOutputIterator dist_it={})
Subsample by an iterative, greedy strategy.
Definition choose_n_farthest_points.h:75
void choose_n_farthest_points_metric(Distance dist_, Point_range const &input_pts, std::size_t final_size, std::size_t starting_point, PointOutputIterator output_it, DistanceOutputIterator dist_it={})
Subsample by an iterative, greedy strategy.
Definition choose_n_farthest_points.h:208
@ random_starting_point
Definition choose_n_farthest_points.h:43
Gudhi namespace.
Definition SimplicialComplexForAlpha.h:14
Definition Null_output_iterator.h:19