You can do bigints, or you can do approximations. I've tried about 8 different
implementations in smhasher (for 32-bit to 256-bit hashes). See this massacre:
#include <math.h>

// Naive multiplication, no accuracy at all
static double ExpectedNBCollisions_Slow ( const double nbH, const double nbBits )
{
    long balls = nbH;
    long double bins = nbBits;
    long double result = 1.0;
    for (long i = 1; i < balls / 2; i++) {
        // take a pair from the front and the end to minimize errors
        result *= ((bins - i) / bins) * ((bins - (nbH - i)) / bins);
    }
    return (double)(nbH * result);
}
// TODO This only works for a low number of collisions
static inline double ExpectedCollisions ( const double balls, const double bins )
{
    return balls - (bins * (1 - pow((bins - 1)/bins, balls)));
}
// Still too inaccurate: https://preshing.com/20110504/hash-collision-probabilities/
static double EstimateNbCollisions_Taylor(const double nbH, const double nbBits)
{
    const long double k = nbH;
    const long double b = nbBits;
    return (double)(k * (1.0 - expl(-0.5 * k * (k - 1.0) / b)));
}
// demerphq: (double(count) * double(count-1)) / pow(2.0, double(sizeof(hashtype) * 8 + 1));
// the very same as our calc (pow 2 vs. exp2); just the high cutoff is missing here.
static double EstimateNbCollisions_Demerphq(const double nbH, const double nbBits)
{
    return (nbH * (nbH - 1)) / pow(2.0, nbBits + 1);
}
// GNU R: qbirthday. rough estimate. FIXME
static double EstimateNbCollisions_R(const double nbH, const double nbBits)
{
    return ceil(exp(((log(nbH) + lgamma(3) + log(-log1p(-0.5)))) / 2));
}
// GNU R: pbirthday. FIXME
/*
static double EstimateNbCollisions_Rp(const double c)
{
    return (1 - prod((c:(c-0.5+1))/rep(2, 0.5)));
}
*/
// The previous best calculation, highly prone to inaccuracies with low results (1.0 - 10.0)
// TODO: return also the error.
static double EstimateNbCollisions_previmpl(const double nbH, const double nbBits)
{
    double exp = exp2(nbBits); // 2 ^ bits
    double result = (nbH * (nbH-1)) / (2.0 * exp);
    if (result > nbH)
        result = nbH;
    // improved floating point accuracy
    if (result <= exp || nbBits > 32)
        return result;
    return result - exp;
}
static double EstimateNbCollisions_fwojcik(const double nbH, const int nbBits)
{
    // If the probability that there are 1 or more collisions (p(C >=
    // 1)) is not much higher than the probability of exactly 1
    // collision (p(C == 1)), then the classically-good approximation
    // of the probability of any collisions is also a good estimate
    // for the expected number of collisions.
    //
    // If there are 2**n buckets and 2**(n-r) hashes, then the ratio
    // of p(C >= 1)/p(C == 1) is about 1/(1-2**(n-2r-1)). This uses
    // the new estimator if that ratio is > 1 + 2**-8. That cutoff
    // minimizes the error around the values we care about.
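    //
    // (Worked example, mine: with nbBits == 32 the condition below is
    // 32 - 2*log2(nbH) >= 7, i.e. the simple estimator is used for
    // nbH <= 2**12.5, about 5792 hashes.)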
    if (nbBits - 2.0*log2(nbH) >= 8 - 1) {
        return nbH * (nbH - 1) * exp2(-nbBits-1);
    }
    // The probability that any given hash bucket is empty after nbH
    // insertions is:
    //     pE = ((2**nbBits - 1)/(2**nbBits))**nbH
    // so we compute:
    //     ln(pE) = nbH * ln((2**nbBits - 1)/(2**nbBits))
    //            = nbH * ln(1 - 1/2**(nbBits))
    //            = nbH * ln(1 - 2**(-nbBits))
    //            = nbH * ln(1 + -(2**(-nbBits)))
    // This means the probability that any given hash bucket is
    // occupied after nbH insertions is:
    //     pF = 1 - pE
    //     pF = 1 - exp(ln(pE))
    //     pF = -(exp(ln(pE)) - 1)
    //     pF = -expm1(ln(pE))
    // And the expected number of collisions, with m = nbH hashes and
    // n = 2**nbBits buckets, is:
    //     C = m - n + n * pE
    //     C = m - n * (1 - pE)
    //     C = n * (m/n - 1 + pE)
    //     C = n * (m/n - (1 - pE))
    //     C = n * (m/n - pF)
    //     C = n * (m/n - (-expm1(ln(pE))))
    //     C = n * (m/n + expm1(ln(pE)))
    // Since the format of floats/doubles is k*2**n, multiplying by
    // exp2(x) doesn't lose any precision, and this formulation keeps
    // m/n and pF at the same general orders of magnitude, so it tends
    // to have very good precision. At low hash occupancy, pF is too
    // close to m/n for this formula to work well.
    double logpE = (double)nbH * log1p(-exp2(-nbBits));
    double result = exp2(nbBits) * (exp2(-nbBits) * (double)nbH + expm1(logpE));
    return result;
}
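And a quick harness (mine, not part of smhasher) to eyeball how the estimators diverge; append it to the file above, and note that ExpectedCollisions takes a bin count while the others take a bit width:

#include <stdio.h>

int main(void)
{
    // (nbH, nbBits) pairs: ~50% collision odds at 32 bits, a heavier
    // 32-bit load, and a fully loaded 24-bit table.
    const double tests[][2] = { {77163, 32}, {1048576, 32}, {16777216, 24} };
    for (int i = 0; i < 3; i++) {
        const double nbH = tests[i][0], nbBits = tests[i][1];
        printf("nbH=%8.0f bits=%2.0f exact-form=%12.4f demerphq=%12.4f prev=%12.4f fwojcik=%12.4f\n",
               nbH, nbBits,
               ExpectedCollisions(nbH, exp2(nbBits)),
               EstimateNbCollisions_Demerphq(nbH, nbBits),
               EstimateNbCollisions_previmpl(nbH, nbBits),
               EstimateNbCollisions_fwojcik(nbH, (int)nbBits));
    }
    return 0;
}

At the fully loaded 24-bit point the quadratic estimate overshoots badly (it gives 2^23, about 8.4M, where the true expectation is near n/e, about 6.2M), which is exactly the regime the occupancy-based branch is there to handle.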