IsoSpec  2.2.1
marginalTrek++.cpp
1 /*
2  * Copyright (C) 2015-2020 Mateusz Łącki and Michał Startek.
3  *
4  * This file is part of IsoSpec.
5  *
6  * IsoSpec is free software: you can redistribute it and/or modify
7  * it under the terms of the Simplified ("2-clause") BSD licence.
8  *
9  * IsoSpec is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12  *
13  * You should have received a copy of the Simplified BSD Licence
14  * along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
15  */
16 
17 
18 #include <cmath>
19 #include <algorithm>
20 #include <vector>
21 #include <cstdlib>
22 #include <queue>
23 #include <utility>
24 #include <cstring>
25 #include <string>
26 #include <limits>
27 #include <memory>
28 #include "platform.h"
29 #include "marginalTrek++.h"
30 #include "conf.h"
31 #include "allocator.h"
32 #include "operators.h"
33 #include "summator.h"
34 #include "element_tables.h"
35 #include "misc.h"
36 
37 
38 namespace IsoSpec
39 {
40 
42 
50 void writeInitialConfiguration(const int atomCnt, const int isotopeNo, const double* lprobs, int* res)
51 {
57  // This approximates the mode (heuristics: the mean is close to the mode).
58  for(int i = 0; i < isotopeNo; ++i)
59  res[i] = static_cast<int>( atomCnt * exp(lprobs[i]) ) + 1;
60 
61  // The number of assigned atoms above.
62  int s = 0;
63 
64  for(int i = 0; i < isotopeNo; ++i) s += res[i];
65 
66  int diff = atomCnt - s;
67 
68  // Too little: enlarging fist index.
69  if( diff > 0 ){
70  res[0] += diff;
71  }
72  // Too much: trying to redistribute the difference: hopefully the first element is the largest.
73  if( diff < 0 ){
74  diff = abs(diff);
75  int i = 0;
76 
77  while( diff > 0){
78  int coordDiff = res[i] - diff;
79 
80  if( coordDiff >= 0 ){
81  res[i] -= diff;
82  diff = 0;
83  } else {
84  res[i] = 0;
85  i++;
86  diff = abs(coordDiff);
87  }
88  }
89  }
90 
91  // What we computed so far will be very close to the mode: hillclimb the rest of the way
92 
93  bool modified = true;
94  double LP = unnormalized_logProb(res, lprobs, isotopeNo);
95  double NLP;
96 
97  while(modified)
98  {
99  modified = false;
100  for(int ii = 0; ii < isotopeNo; ii++)
101  for(int jj = 0; jj < isotopeNo; jj++)
102  if(ii != jj && res[ii] > 0)
103  {
104  res[ii]--;
105  res[jj]++;
106  NLP = unnormalized_logProb(res, lprobs, isotopeNo);
107  if(NLP > LP || (NLP == LP && ii > jj))
108  {
109  modified = true;
110  LP = NLP;
111  }
112  else
113  {
114  res[ii]++;
115  res[jj]--;
116  }
117  }
118  }
119 }
120 
121 
122 double* getMLogProbs(const double* probs, int isoNo)
123 {
129  for(int ii = 0; ii < isoNo; ii++)
130  if(probs[ii] <= 0.0 || probs[ii] > 1.0)
131  throw std::invalid_argument("All isotope probabilities p must fulfill: 0.0 < p <= 1.0");
132 
133  double* ret = new double[isoNo];
134 
135  // here we change the table of probabilities and log it.
136  for(int i = 0; i < isoNo; i++)
137  {
138  ret[i] = log(probs[i]);
139  for(int j = 0; j < ISOSPEC_NUMBER_OF_ISOTOPIC_ENTRIES; j++)
140  if(elem_table_probability[j] == probs[i])
141  {
142  ret[i] = elem_table_log_probability[j];
143  break;
144  }
145  }
146  return ret;
147 }
148 
149 double get_loggamma_nominator(int x)
150 {
151  // calculate log gamma of the nominator calculated in the binomial exression.
152  double ret = lgamma(x+1);
153  return ret;
154 }
155 
156 int verify_atom_cnt(int atomCnt)
157 {
158  #if !ISOSPEC_BUILDING_OPENMS
159  if(ISOSPEC_G_FACT_TABLE_SIZE-1 <= atomCnt)
160  throw std::length_error("Subisotopologue too large, size limit (that is, the maximum number of atoms of a single element in a molecule) is: " + std::to_string(ISOSPEC_G_FACT_TABLE_SIZE-1));
161  #endif
162  return atomCnt;
163 }
164 
166  const double* _masses,
167  const double* _probs,
168  int _isotopeNo,
169  int _atomCnt
170 ) :
171 disowned(false),
172 isotopeNo(_isotopeNo),
173 atomCnt(verify_atom_cnt(_atomCnt)),
174 atom_lProbs(getMLogProbs(_probs, isotopeNo)),
175 atom_masses(array_copy<double>(_masses, _isotopeNo)),
176 loggamma_nominator(get_loggamma_nominator(_atomCnt)),
177 mode_conf(nullptr)
178 // Deliberately not initializing mode_lprob
179 {}
180 
182 disowned(false),
183 isotopeNo(other.isotopeNo),
184 atomCnt(other.atomCnt),
185 atom_lProbs(array_copy<double>(other.atom_lProbs, isotopeNo)),
186 atom_masses(array_copy<double>(other.atom_masses, isotopeNo)),
187 loggamma_nominator(other.loggamma_nominator)
188 {
189  if(other.mode_conf == nullptr)
190  {
191  mode_conf = nullptr;
192  // Deliberately not initializing mode_lprob. In this state other.mode_lprob is uninitialized too.
193  }
194  else
195  {
196  mode_conf = array_copy<int>(other.mode_conf, isotopeNo);
197  mode_lprob = other.mode_lprob;
198  }
199 }
200 
201 
202 // the move-constructor: used in the specialization of the marginal.
204 disowned(other.disowned),
205 isotopeNo(other.isotopeNo),
206 atomCnt(other.atomCnt),
207 atom_lProbs(other.atom_lProbs),
208 atom_masses(other.atom_masses),
209 loggamma_nominator(other.loggamma_nominator)
210 {
211  other.disowned = true;
212  if(other.mode_conf == nullptr)
213  {
214  mode_conf = nullptr;
215  // Deliberately not initializing mode_lprob. In this state other.mode_lprob is uninitialized too.
216  }
217  else
218  {
219  mode_conf = other.mode_conf;
220  mode_lprob = other.mode_lprob;
221  }
222 }
223 
225 {
226  if(!disowned)
227  {
228  delete[] atom_masses;
229  delete[] atom_lProbs;
230  delete[] mode_conf;
231  }
232 }
233 
234 
236 {
237  Conf res = new int[isotopeNo];
239  return res;
240 }
241 
242 void Marginal::setupMode()
243 {
244  ISOSPEC_IMPOSSIBLE(mode_conf != nullptr);
246  mode_lprob = logProb(mode_conf);
247 }
248 
249 
251 {
252  double ret_mass = std::numeric_limits<double>::infinity();
253  for(unsigned int ii = 0; ii < isotopeNo; ii++)
254  if( ret_mass > atom_masses[ii] )
255  ret_mass = atom_masses[ii];
256  return ret_mass*atomCnt;
257 }
258 
260 {
261  double ret_mass = 0.0;
262  for(unsigned int ii = 0; ii < isotopeNo; ii++)
263  if( ret_mass < atom_masses[ii] )
264  ret_mass = atom_masses[ii];
265  return ret_mass*atomCnt;
266 }
267 
269 {
270  double found_prob = -std::numeric_limits<double>::infinity();
271  double found_mass = 0.0; // to avoid uninitialized var warning
272  for(unsigned int ii = 0; ii < isotopeNo; ii++)
273  if( found_prob < atom_lProbs[ii] )
274  {
275  found_prob = atom_lProbs[ii];
276  found_mass = atom_masses[ii];
277  }
278  return found_mass*atomCnt;
279 }
280 
282 {
283  double ret = 0.0;
284  for(unsigned int ii = 0; ii < isotopeNo; ii++)
285  ret += exp(atom_lProbs[ii]) * atom_masses[ii];
286  return ret;
287 }
288 
289 double Marginal::variance() const
290 {
291  double ret = 0.0;
292  double mean = getAtomAverageMass();
293  for(size_t ii = 0; ii < isotopeNo; ii++)
294  {
295  double msq = atom_masses[ii] - mean;
296  ret += exp(atom_lProbs[ii]) * msq * msq;
297  }
298  return ret * atomCnt;
299 }
300 
301 double Marginal::getLogSizeEstimate(double logEllipsoidRadius) const
302 {
303  if(isotopeNo <= 1)
304  return -std::numeric_limits<double>::infinity();
305 
306  const double i = static_cast<double>(isotopeNo);
307  const double k = i - 1.0;
308  const double n = static_cast<double>(atomCnt);
309 
310  double sum_lprobs = 0.0;
311  for(int jj = 0; jj < i; jj++)
312  sum_lprobs += atom_lProbs[jj];
313 
314  double log_V_simplex = k * log(n) - lgamma(i);
315  double log_N_simplex = lgamma(n+i) - lgamma(n+1.0) - lgamma(i);
316  double log_V_ellipsoid = (k * (log(n) + logpi + logEllipsoidRadius) + sum_lprobs) * 0.5 - lgamma((i+1)*0.5);
317 
318  return log_N_simplex + log_V_ellipsoid - log_V_simplex;
319 }
320 
321 
322 // this is roughly an equivalent of IsoSpec-Threshold-Generator
324  Marginal&& m,
325  int tabSize,
326  int
327 ) :
328 Marginal(std::move(m)),
329 current_count(0),
330 orderMarginal(atom_lProbs, isotopeNo),
331 pq(),
332 allocator(isotopeNo, tabSize),
333 min_lprob(*std::min_element(atom_lProbs, atom_lProbs+isotopeNo))
334 {
335  int* initialConf = allocator.makeCopy(mode_conf);
336 
337  pq.push({mode_lprob, initialConf});
338 
339  current_count = 0;
340 
341  fringe.resize_and_wipe(1);
342 
343  current_bucket = 0;
344  initialized_until = 1;
345 
346  add_next_conf();
347 }
348 
349 
350 bool MarginalTrek::add_next_conf()
351 {
356  if(pq.empty())
357  {
358  current_bucket++;
359  while(current_bucket < initialized_until && fringe[current_bucket].empty())
360  {
361 // std::cout << "EMPTY bucket, id: " << current_bucket << std::endl;
362  current_bucket++;
363  }
364 
365 // std::cout << "Entering bucket, size: " << fringe[current_bucket].size() << std::endl;
366 
367  if(current_bucket >= initialized_until)
368  return false;
369 
370  // std::cout << "Fringe size at pop: " << fringe[current_bucket].size() << std::endl;
371  pq = std::priority_queue<ProbAndConfPtr, pod_vector<ProbAndConfPtr> >(std::less<ProbAndConfPtr>(), pod_vector<ProbAndConfPtr>(std::move(fringe[current_bucket])));
372  };
373 
374  double logprob = pq.top().first;
375  Conf topConf = pq.top().second;
376 
377  pq.pop();
378  ++current_count;
379 
380  _confs.push_back(topConf);
381 
382  _conf_masses.push_back(calc_mass(topConf, atom_masses, isotopeNo));
383  _conf_lprobs.push_back(logprob);
384 
385  for( unsigned int j = 0; j < isotopeNo; ++j )
386  {
387  if( topConf[j] > mode_conf[j])
388  continue;
389 
390  if( topConf[j] > 0 )
391  {
392  for( unsigned int i = 0; i < isotopeNo; ++i )
393  {
394  if( topConf[i] < mode_conf[i] )
395  continue;
396  // Growing index different than decreasing one AND
397  // Remain on simplex condition.
398  if( i != j ){
399  Conf acceptedCandidate = allocator.makeCopy(topConf);
400 
401  ++acceptedCandidate[i];
402  --acceptedCandidate[j];
403 
404  double new_prob = logProb(acceptedCandidate);
405  size_t bucket_nr = bucket_no(new_prob);
406 
407  if(bucket_nr >= initialized_until)
408  {
409  // std::cout << "Extending to: " << bucket_nr << std::endl;
410  initialized_until = bucket_nr+1;
411  fringe.resize_and_wipe(initialized_until);
412  }
413 
414  ISOSPEC_IMPOSSIBLE(bucket_nr < current_bucket);
415  if(bucket_nr == current_bucket)
416  pq.push({new_prob, acceptedCandidate});
417  else
418  fringe[bucket_nr].push_back({new_prob, acceptedCandidate});
419 
420  }
421 
422  if( topConf[i] > mode_conf[i] )
423  break;
424  }
425  }
426  if( topConf[j] < mode_conf[j] )
427  break;
428  }
429 
430  return true;
431 }
432 
433 
434 MarginalTrek::~MarginalTrek()
435 {
436  const size_t fringe_size = fringe.size();
437  for(size_t ii = 0; ii < fringe_size; ii++)
438  fringe[ii].clear();
439 }
440 
441 
442 
444  double lCutOff,
445  bool sort,
446  int tabSize,
447  int
448 ) : Marginal(std::move(m)),
449 allocator(isotopeNo, tabSize)
450 {
451  Conf currentConf = allocator.makeCopy(mode_conf);
452  if(logProb(currentConf) >= lCutOff)
453  {
454  configurations.push_back(currentConf);
455  lProbs.push_back(mode_lprob);
456  }
457 
458  unsigned int idx = 0;
459 
460  std::unique_ptr<double[]> prob_partials(new double[isotopeNo]);
461  std::unique_ptr<double[]> prob_part_acc(new double[isotopeNo+1]);
462  prob_part_acc[0] = loggamma_nominator;
463 
464  while(idx < configurations.size())
465  {
466  currentConf = configurations[idx];
467  idx++;
468 
469  for(size_t ii = 0; ii < isotopeNo; ii++)
470  prob_partials[ii] = minuslogFactorial(currentConf[ii]) + currentConf[ii] * atom_lProbs[ii];
471 
472  for(unsigned int ii = 0; ii < isotopeNo; ii++ )
473  {
474  if(currentConf[ii] > mode_conf[ii])
475  continue;
476 
477  if(currentConf[ii] != 0)
478  {
479  double prev_partial_ii = prob_partials[ii];
480  currentConf[ii]--;
481  prob_partials[ii] = minuslogFactorial(currentConf[ii]) + currentConf[ii] * atom_lProbs[ii];
482 
483  for(unsigned int jj = 0; jj < isotopeNo; jj++ )
484  {
485  prob_part_acc[jj+1] = prob_part_acc[jj] + prob_partials[jj];
486 
487  if(currentConf[jj] < mode_conf[jj])
488  continue;
489 
490  if( ii != jj )
491  {
492  double logp = prob_part_acc[jj] + minuslogFactorial(1+currentConf[jj]) + (1+currentConf[jj]) * atom_lProbs[jj];
493  for(size_t kk = jj+1; kk < isotopeNo; kk++)
494  logp += prob_partials[kk];
495 
496  if (logp >= lCutOff)
497  {
498  auto tmp = allocator.makeCopy(currentConf);
499  tmp[jj]++;
500  configurations.push_back(tmp);
501  lProbs.push_back(logp);
502  }
503  }
504  else
505  prob_part_acc[jj+1] = prob_part_acc[jj] + prob_partials[jj];
506 
507  if (currentConf[jj] > mode_conf[jj])
508  break;
509  }
510  currentConf[ii]++;
511  prob_partials[ii] = prev_partial_ii;
512  }
513 
514  if(currentConf[ii] < mode_conf[ii])
515  break;
516  }
517  }
518 
519  no_confs = configurations.size();
520  confs = configurations.data();
521 
522  if(sort && no_confs > 0)
523  {
524  std::unique_ptr<size_t[]> order_arr(get_inverse_order(lProbs.data(), no_confs));
525  impose_order(order_arr.get(), no_confs, lProbs.data(), confs);
526  }
527 
528  probs = new double[no_confs];
529  masses = new double[no_confs];
530 
531 
532  for(unsigned int ii = 0; ii < no_confs; ii++)
533  {
534  probs[ii] = exp(lProbs[ii]);
535  masses[ii] = calc_mass(confs[ii], atom_masses, isotopeNo);
536  }
537 
538  lProbs.push_back(-std::numeric_limits<double>::infinity());
539 }
540 
541 
543 {
544  if(masses != nullptr)
545  delete[] masses;
546  if(probs != nullptr)
547  delete[] probs;
548 }
549 
550 
551 
552 
553 
554 
555 
557 : Marginal(std::move(m)), current_threshold(1.0), allocator(isotopeNo, tabSize),
558 equalizer(isotopeNo), keyHasher(isotopeNo)
559 {
560  fringe.push_back(mode_conf);
561  lProbs.push_back(std::numeric_limits<double>::infinity());
562  fringe_unn_lprobs.push_back(unnormalized_logProb(mode_conf));
563  lProbs.push_back(-std::numeric_limits<double>::infinity());
564  guarded_lProbs = lProbs.data()+1;
565 }
566 
567 bool LayeredMarginal::extend(double new_threshold, bool do_sort)
568 {
569  new_threshold -= loggamma_nominator;
570  if(fringe.empty())
571  return false;
572 
573  lProbs.pop_back(); // Remove the +inf guardian
574 
575  pod_vector<Conf> new_fringe;
576  pod_vector<double> new_fringe_unn_lprobs;
577 
578  while(!fringe.empty())
579  {
580  Conf currentConf = fringe.back();
581  fringe.pop_back();
582 
583  double opc = fringe_unn_lprobs.back();
584 
585  fringe_unn_lprobs.pop_back();
586  if(opc < new_threshold)
587  {
588  new_fringe.push_back(currentConf);
589  new_fringe_unn_lprobs.push_back(opc);
590  }
591 
592  else
593  {
594  configurations.push_back(currentConf);
595  lProbs.push_back(opc+loggamma_nominator);
596  for(unsigned int ii = 0; ii < isotopeNo; ii++ )
597  {
598  if(currentConf[ii] > mode_conf[ii])
599  continue;
600 
601  if(currentConf[ii] > 0)
602  {
603  currentConf[ii]--;
604  for(unsigned int jj = 0; jj < isotopeNo; jj++ )
605  {
606  if(currentConf[jj] < mode_conf[jj])
607  continue;
608 
609  if( ii != jj )
610  {
611  Conf nc = allocator.makeCopy(currentConf);
612  nc[jj]++;
613 
614  double lpc = unnormalized_logProb(nc);
615  if(lpc >= new_threshold)
616  {
617  fringe.push_back(nc);
618  fringe_unn_lprobs.push_back(lpc);
619  }
620  else
621  {
622  new_fringe.push_back(nc);
623  new_fringe_unn_lprobs.push_back(lpc);
624  }
625  }
626 
627  if(currentConf[jj] > mode_conf[jj])
628  break;
629  }
630  currentConf[ii]++;
631  }
632 
633  if(currentConf[ii] < mode_conf[ii])
634  break;
635  }
636  }
637  }
638 
639  current_threshold = new_threshold;
640  fringe.swap(new_fringe);
641  fringe_unn_lprobs.swap(new_fringe_unn_lprobs);
642 
643  if(do_sort)
644  {
645  size_t to_sort_size = configurations.size() - probs.size();
646  if(to_sort_size > 0)
647  {
648  std::unique_ptr<size_t[]> order_arr(get_inverse_order(lProbs.data()+1+probs.size(), to_sort_size));
649  double* P = lProbs.data()+1+probs.size();
650  Conf* C = configurations.data()+probs.size();
651  size_t* O = order_arr.get();
652  impose_order(O, to_sort_size, P, C);
653  }
654  }
655 
656  if(probs.capacity() * 2 < configurations.size() + 2)
657  {
658  // Reserve space for new values
659  probs.reserve(configurations.size());
660  masses.reserve(configurations.size());
661  } // Otherwise we're growing slowly enough that standard reallocations on push_back work better - we waste some extra memory
662  // but don't reallocate on every call
663 
664 // printVector(lProbs);
665  for(unsigned int ii = probs.size(); ii < configurations.size(); ii++)
666  {
667  probs.push_back(exp(lProbs[ii+1]));
668  masses.push_back(calc_mass(configurations[ii], atom_masses, isotopeNo));
669  }
670 
671  lProbs.push_back(-std::numeric_limits<double>::infinity()); // Restore guardian
672 
673  guarded_lProbs = lProbs.data()+1; // Vector might have reallocated its backing storage
674 
675  return true;
676 }
677 
678 
680 {
681  double ret = std::numeric_limits<double>::infinity();
682  for(pod_vector<double>::const_iterator it = masses.cbegin(); it != masses.cend(); ++it)
683  if(*it < ret)
684  ret = *it;
685  return ret;
686 }
687 
688 
690 {
691  double ret = -std::numeric_limits<double>::infinity();
692  for(pod_vector<double>::const_iterator it = masses.cbegin(); it != masses.cend(); ++it)
693  if(*it > ret)
694  ret = *it;
695  return ret;
696 }
697 
698 } // namespace IsoSpec
bool extend(double new_threshold, bool do_sort=true)
Extend the set of computed subisotopologues to those above the new threshold.
double get_max_mass() const
Get the maximal mass in current layer.
double get_min_mass() const
Get the minimal mass in current layer.
LayeredMarginal(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
The marginal distribution class (a subisotopologue).
Conf computeModeConf() const
The the probability of the mode subisotopologue.
Marginal(const double *_masses, const double *_probs, int _isotopeNo, int _atomCnt)
Class constructor.
const unsigned int atomCnt
double getLightestConfMass() const
Get the mass of the lightest subisotopologue.
const unsigned int isotopeNo
const double *const atom_masses
double variance() const
Calculate the variance of the theoretical distribution describing the subisotopologue.
ISOSPEC_FORCE_INLINE double unnormalized_logProb(Conf conf) const
Calculate the log-probability of a given subisotopologue.
const double loggamma_nominator
double getHeaviestConfMass() const
Get the mass of the heaviest subisotopologue.
double getAtomAverageMass() const
The average mass of a single atom.
double getMonoisotopicConfMass() const
Get the mass of the monoisotopic subisotopologue.
virtual ~Marginal()
Destructor.
double getLogSizeEstimate(double logEllipsoidRadius) const
Return estimated logarithm of size of the marginal at a given ellipsoid radius.
const double *const atom_lProbs
MarginalTrek(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
virtual ~PrecalculatedMarginal()
Destructor.
PrecalculatedMarginal(Marginal &&m, double lCutOff, bool sort=true, int tabSize=1000, int hashSize=1000)
The move constructor (disowns the Marginal).
double * getMLogProbs(const double *probs, int isoNo)
void writeInitialConfiguration(const int atomCnt, const int isotopeNo, const double *lprobs, int *res)
Find one of the most probable subisotopologues.