RDKit
Open-source cheminformatics and machine learning.
LeaderPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3 // Copyright (C) 2017-2019 Greg Landrum and NextMove Software
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #ifndef RD_LEADERPICKER_H
12 #define RD_LEADERPICKER_H
13 
14 #include <RDGeneral/types.h>
15 #include <RDGeneral/utils.h>
16 #include <RDGeneral/Invariant.h>
17 #include <RDGeneral/RDLog.h>
18 #include <RDGeneral/Exceptions.h>
19 #include <cstdlib>
20 #include "DistPicker.h"
21 
22 namespace RDPickers {
23 
24 /*! \brief Implements the Leader algorithm for picking a subset of item from a
25  *pool
26  *
27  * This class inherits from the DistPicker and implements a specific picking
28  *strategy aimed at diversity. See documentation for "pick()" member function
29  *for the algorithm details
30  */
31 class LeaderPicker : public DistPicker {
32  public:
33  double default_threshold{0.0};
35 
36  /*! \brief Default Constructor
37  *
38  */
40  LeaderPicker(double threshold)
41  : default_threshold(threshold), default_nthreads(1) {}
42  LeaderPicker(double threshold, int nthreads)
43  : default_threshold(threshold), default_nthreads(nthreads) {}
44 
45  /*! \brief Contains the implementation for a lazy Leader diversity picker
46  *
47  * See the documentation for the pick() method for details about the algorithm
48  *
49  * \param func - a function (or functor) taking two unsigned ints as
50  *arguments and returning the distance (as a double) between those two
51  *elements.
52  *
53  * \param poolSize - the size of the pool to pick the items from. It is
54  *assumed that the distance matrix above contains the right number of
55  *elements; i.e. poolSize*(poolSize-1)
56  *
57  * \param pickSize - the number items to pick from pool (<= poolSize)
58  *
59  * \param firstPicks - (optional)the first items in the pick list
60  *
61  * \param seed - (optional) seed for the random number generator. If this is
62  *<0 the generator will be seeded with a random number.
63  */
64  template <typename T>
65  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
66  unsigned int pickSize) const;
67 
68  template <typename T>
69  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
70  unsigned int pickSize, double threshold) const;
71 
72  template <typename T>
73  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
74  unsigned int pickSize,
75  const RDKit::INT_VECT &firstPicks,
76  double threshold) const;
77 
78  template <typename T>
79  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
80  unsigned int pickSize,
81  const RDKit::INT_VECT &firstPicks, double threshold,
82  int nthreads) const;
83 
84  /*! \brief Contains the implementation for the Leader diversity picker
85  *
86  * \param distMat - distance matrix - a vector of double. It is assumed that
87  *only the lower triangle element of the matrix are supplied in a 1D array\n
88  *
89  * \param poolSize - the size of the pool to pick the items from. It is
90  *assumed that the distance matrix above contains the right number of
91  *elements; i.e. poolSize*(poolSize-1) \n
92  *
93  * \param pickSize - maximum number items to pick from pool (<= poolSize)
94  *
95  * \param firstPicks - indices of the items used to seed the pick set.
96  */
97  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
98  unsigned int pickSize, const RDKit::INT_VECT &firstPicks,
99  double threshold, int nthreads) const {
100  CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
101  if (!poolSize) throw ValueErrorException("empty pool to pick from");
102  if (poolSize < pickSize)
103  throw ValueErrorException("pickSize cannot be larger than the poolSize");
104  distmatFunctor functor(distMat);
105  return this->lazyPick(functor, poolSize, pickSize, firstPicks, threshold,
106  nthreads);
107  }
108 
109  /*! \overload */
110  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
111  unsigned int pickSize) const {
112  RDKit::INT_VECT iv;
113  return pick(distMat, poolSize, pickSize, iv, default_threshold,
115  }
116 };
117 
118 #ifdef USE_THREADED_LEADERPICKER
119 // Note that this block of code currently only works on linux (which is why it's
120 // disabled by default) We will revisit this during the 2020.03 release cycle in
121 // order to get a multi-threaded version of the LeaderPicker that works on all
122 // supported platforms
123 template <typename T>
124 void *LeaderPickerWork(void *arg);
125 
126 template <typename T>
127 struct LeaderPickerState {
128  typedef struct {
129  int *ptr;
130  unsigned int capacity;
131  unsigned int len;
132  unsigned int next[2];
133  } LeaderPickerBlock;
134  typedef struct {
135  LeaderPickerState<T> *stat;
136  pthread_t tid;
137  unsigned int id;
138  } LeaderPickerThread;
139 
140  std::vector<LeaderPickerThread> threads;
141  std::vector<LeaderPickerBlock> blocks;
142  pthread_barrier_t wait;
143  pthread_barrier_t done;
144  std::vector<int> v;
145  LeaderPickerBlock *head_block;
146  unsigned int thread_op;
147  unsigned int nthreads;
148  unsigned int tick;
149  double threshold;
150  int query;
151  T *func;
152 
153  LeaderPickerState(unsigned int count, int nt) {
154  v.resize(count);
155  for (unsigned int i = 0; i < count; i++) v[i] = i;
156 
157  // InitializeBlocks
158  unsigned int bcount;
159  unsigned int bsize;
160  if (nt > 1) {
161  bsize = 4096;
162  bcount = (count + (bsize - 1)) / bsize;
163  unsigned int tasks = (bcount + 1) / 2;
164  // limit number of threads to available work
165  if (nt > (int)tasks) nt = tasks;
166  } else {
167  bsize = 32768;
168  bcount = (count + (bsize - 1)) / bsize;
169  }
170  blocks.resize(bcount);
171  head_block = &blocks[0];
172  tick = 0;
173 
174  if (bcount > 1) {
175  int *ptr = &v[0];
176  unsigned int len = count;
177  for (unsigned int i = 0; i < bcount; i++) {
178  LeaderPickerBlock *block = &blocks[i];
179  block->ptr = ptr;
180  if (len > bsize) {
181  block->capacity = bsize;
182  block->len = bsize;
183  block->next[0] = i + 1;
184  } else {
185  block->capacity = len;
186  block->len = len;
187  block->next[0] = 0;
188  break;
189  }
190  ptr += bsize;
191  len -= bsize;
192  }
193  } else {
194  head_block->capacity = count;
195  head_block->len = count;
196  head_block->next[0] = 0;
197  head_block->next[1] = 0;
198  head_block->ptr = &v[0];
199  }
200 
201  // InitializeThreads
202  if (nt > 1) {
203  nthreads = nt;
204  pthread_barrier_init(&wait, NULL, nthreads + 1);
205  pthread_barrier_init(&done, NULL, nthreads + 1);
206 
207  threads.resize(nt);
208  for (unsigned int i = 0; i < nthreads; i++) {
209  threads[i].id = i;
210  threads[i].stat = this;
211  pthread_create(&threads[i].tid, NULL, LeaderPickerWork<T>,
212  (void *)&threads[i]);
213  }
214  } else
215  nthreads = 1;
216  }
217 
218  ~LeaderPickerState() {
219  if (nthreads > 1) {
220  thread_op = 1;
221  pthread_barrier_wait(&wait);
222  for (unsigned int i = 0; i < nthreads; i++)
223  pthread_join(threads[i].tid, 0);
224  pthread_barrier_destroy(&wait);
225  pthread_barrier_destroy(&done);
226  }
227  }
228 
229  bool empty() {
230  while (head_block) {
231  if (head_block->len) return false;
232  unsigned int next_tick = head_block->next[tick];
233  if (!next_tick) return true;
234  head_block = &blocks[next_tick];
235  }
236  return true;
237  }
238 
239  unsigned int compact(int *dst, int *src, unsigned int len) {
240  unsigned int count = 0;
241  for (unsigned int i = 0; i < len; i++) {
242  if ((*func)(query, src[i]) > threshold) dst[count++] = src[i];
243  }
244  return count;
245  }
246 
247  void compact_job(unsigned int cycle) {
248  // On entry, next[tick] for each block is the current linked list.
249  // On exit, next[tock] is the linked list for the next iteration.
250  unsigned int tock = tick ^ 1;
251 
252  LeaderPickerBlock *list = head_block;
253  for (;;) {
254  unsigned int next_tick = list->next[tick];
255  if (next_tick) {
256  LeaderPickerBlock *next = &blocks[next_tick];
257  unsigned int next_next_tick = next->next[tick];
258  if (cycle == 0) {
259  list->len = compact(list->ptr, list->ptr, list->len);
260  if (list->len + next->len <= list->capacity) {
261  list->len += compact(list->ptr + list->len, next->ptr, next->len);
262  list->next[tock] = next_next_tick;
263  } else {
264  next->len = compact(next->ptr, next->ptr, next->len);
265  if (next->len) {
266  list->next[tock] = next_tick;
267  next->next[tock] = next_next_tick;
268  } else
269  list->next[tock] = next_next_tick;
270  }
271  cycle = nthreads - 1;
272  } else
273  cycle--;
274  if (next_next_tick) {
275  list = &blocks[next_next_tick];
276  } else
277  break;
278  } else {
279  if (cycle == 0) {
280  list->len = compact(list->ptr, list->ptr, list->len);
281  list->next[tock] = 0;
282  }
283  break;
284  }
285  }
286  }
287 
288  void compact(int pick) {
289  query = pick;
290  if (nthreads > 1) {
291  thread_op = 0;
292  pthread_barrier_wait(&wait);
293  pthread_barrier_wait(&done);
294  } else
295  compact_job(0);
296  tick ^= 1;
297  }
298 
299  int compact_next() {
300  compact(head_block->ptr[0]);
301  return query;
302  }
303 };
304 
305 // This is the loop the worker threads run
306 template <typename T>
307 void *LeaderPickerWork(void *arg) {
308  typename LeaderPickerState<T>::LeaderPickerThread *thread;
309  thread = (typename LeaderPickerState<T>::LeaderPickerThread *)arg;
310  LeaderPickerState<T> *stat = thread->stat;
311 
312  for (;;) {
313  pthread_barrier_wait(&stat->wait);
314  if (stat->thread_op) return (void *)0;
315  stat->compact_job(thread->id);
316  pthread_barrier_wait(&stat->done);
317  }
318 }
319 #else
320 
321 template <typename T>
323  std::vector<int> v;
324  unsigned int left;
325  double threshold;
326  int query;
327  T *func;
328 
329  LeaderPickerState(unsigned int count, int) : left(count), threshold(0.0), query(0), func(nullptr) {
330  v.resize(count);
331  for (unsigned int i = 0; i < count; i++) v[i] = i;
332  }
333 
334  bool empty() { return left == 0; }
335 
336  unsigned int compact(int *dst, int *src, unsigned int len) {
337  unsigned int count = 0;
338  for (unsigned int i = 0; i < len; i++) {
339  double ld = (*func)(query, src[i]);
340  // std::cerr << query << "-" << src[i] << " " << ld << std::endl;
341  if (ld > threshold) dst[count++] = src[i];
342  }
343  return count;
344  }
345 
346  void compact(int pick) {
347  query = pick;
348  left = compact(&v[0], &v[0], left);
349  }
350 
351  int compact_next() {
352  query = v[0];
353  left = compact(&v[0], &v[1], left - 1);
354  return query;
355  }
356 };
357 
358 #endif
359 // we implement this here in order to allow arbitrary functors without link
360 // errors
361 template <typename T>
362 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
363  unsigned int pickSize,
364  const RDKit::INT_VECT &firstPicks,
365  double threshold, int nthreads) const {
366  if (!poolSize) throw ValueErrorException("empty pool to pick from");
367 
368  if (poolSize < pickSize)
369  throw ValueErrorException("pickSize cannot be larger than the poolSize");
370 
371  if (!pickSize) pickSize = poolSize;
372  RDKit::INT_VECT picks;
373 
374  LeaderPickerState<T> stat(poolSize, nthreads);
375  stat.threshold = threshold;
376  stat.func = &func;
377 
378  unsigned int picked = 0; // picks.size()
379  unsigned int pick = 0;
380 
381  if (!firstPicks.empty()) {
382  for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
383  pIdx != firstPicks.end(); ++pIdx) {
384  pick = static_cast<unsigned int>(*pIdx);
385  if (pick >= poolSize) {
386  throw ValueErrorException("pick index was larger than the poolSize");
387  }
388  picks.push_back(pick);
389  stat.compact(pick);
390  picked++;
391  }
392  }
393 
394  while (picked < pickSize && !stat.empty()) {
395  pick = stat.compact_next();
396  picks.push_back(pick);
397  picked++;
398  }
399  return picks;
400 }
401 
402 template <typename T>
403 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
404  unsigned int pickSize) const {
405  RDKit::INT_VECT firstPicks;
406  return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks,
408 }
409 
410 template <typename T>
411 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
412  unsigned int pickSize,
413  double threshold) const {
414  RDKit::INT_VECT firstPicks;
415  return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
417 }
418 template <typename T>
419 RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
420  unsigned int pickSize,
421  const RDKit::INT_VECT &firstPicks,
422  double threshold) const {
423  return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
425 }
426 
427 }; // namespace RDPickers
428 
429 #endif
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:101
Abstract base class to do perform item picking (typically molecules) using a distance matrix.
Definition: DistPicker.h:46
Implements the Leader algorithm for picking a subset of item from a pool.
Definition: LeaderPicker.h:31
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy Leader diversity picker.
Definition: LeaderPicker.h:403
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, const RDKit::INT_VECT &firstPicks, double threshold, int nthreads) const
Contains the implementation for the Leader diversity picker.
Definition: LeaderPicker.h:97
LeaderPicker()
Default Constructor.
Definition: LeaderPicker.h:39
LeaderPicker(double threshold)
Definition: LeaderPicker.h:40
LeaderPicker(double threshold, int nthreads)
Definition: LeaderPicker.h:42
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
Definition: LeaderPicker.h:110
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:39
std::vector< int > INT_VECT
Definition: types.h:271
const std::string done
unsigned int compact(int *dst, int *src, unsigned int len)
Definition: LeaderPicker.h:336
LeaderPickerState(unsigned int count, int)
Definition: LeaderPicker.h:329