rurban:
You can do bigints or you can do approximations. I've tried about 8 different implementations in smhasher (for 32-bit to 256-bit hashes). See this massacre:

    // Naive multiplication, no accuracy at all
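    // (note: despite its name, nbBits is treated as the raw number of bins below)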
    static double ExpectedNBCollisions_Slow ( const double nbH, const double nbBits )
    {
      long balls = nbH;
      long double bins = nbBits;
      long double result = 1.0;
      for (long i = 1; i < balls / 2; i++) {
        // take a pair from the front and the end to minimize errors
        result *= ((bins - i) / bins) * ((bins - (nbH - i)) / bins);
      }
      return (double)(nbH * result);
    }
    
    // TODO This only works for a low number of collisions
    static inline double ExpectedCollisions ( const double balls, const double bins )
    {
      return balls - (bins * (1 - pow((bins - 1)/bins, balls)));
    }
    
    // Still too inaccurate: https://preshing.com/20110504/hash-collision-probabilities/
    static double EstimateNbCollisions_Taylor(const double nbH, const double nbBits)
    {
      const long double k = nbH;
      const long double b = nbBits;
      return (double)(k * (1.0 - expl(-0.5 * k * (k - 1.0) / b)));
    }
    
    // demerphq: (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1));
    // The very same as our calc, just with pow(2.0, x) instead of exp2(x); only the high cutoff is missing here.
    static double EstimateNbCollisions_Demerphq(const double nbH, const double nbBits)
    {
      return (nbH * (nbH - 1)) / pow(2.0, nbBits + 1);
    }
    
    // GNU R: qbirthday. rough estimate. FIXME
    static double EstimateNbCollisions_R(const double nbH, const double nbBits)
    {
      return ceil(exp(((log(nbH) + lgamma(3) + log(-log1p(-0.5)))) / 2));
    }
    
    // GNU R: pbirthday. FIXME
    /*
    static double EstimateNbCollisions_Rp(const double c)
    {
      return (1 - prod((c:(c-0.5+1))/rep(2, 0.5)));
    }
    */
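    
    // Not in smhasher -- a sketch of the exact pbirthday-style computation the
    // FIXME above is aiming at: p(no collision) = prod_{i=1}^{m-1} (1 - i/N),
    // with N = 2**nbBits bins and m = nbH keys, accumulated in log space so it
    // doesn't underflow. Exact, but O(m), so only viable for small key counts.
    static double ExactCollisionProb_sketch(const double nbH, const double nbBits)
    {
      const long double N = exp2l((long double)nbBits);
      long double logpNone = 0.0L;
      for (long i = 1; i < (long)nbH; i++)
        logpNone += log1pl(-(long double)i / N);
      return (double)(-expm1l(logpNone));  // p(at least one collision)
    }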
    
    // The previous best calculation; highly prone to inaccuracy when the
    // result is low (1.0 - 10.0).
    // TODO: also return the error.
    static double EstimateNbCollisions_previmpl(const double nbH, const double nbBits)
    {
      double exp = exp2(nbBits); // 2 ^ bits
      double result = (nbH * (nbH-1)) / (2.0 * exp);
      if (result > nbH)
        result = nbH;
      // improved floating point accuracy
      if (result <= exp || nbBits > 32)
        return result;
      return result - exp;
    }
    
    static double EstimateNbCollisions_fwojcik(const double nbH, const int nbBits)
    {
        // If the probability that there are 1 or more collisions (p(C >=
        // 1)) is not much higher than the probability of exactly 1
        // collision (p(C == 1)), then the classically-good approximation
        // of the probability of any collisions is also a good estimate
        // for the expected number of collisions.
        //
        // If there are 2**n buckets and 2**(n-r) hashes, then the ratio
        // of p(C >= 1)/p(C == 1) is about 1/(1-2**(n-2r-1)). This uses
        // the new estimator if that ratio is > 1 + 2**-8. That cutoff
        // minimizes the error around the values we care about.
        if (nbBits - 2.0*log2(nbH) >= 8 - 1) {
            return nbH * (nbH - 1) * exp2(-nbBits-1);
        }
    
        // The probability that any given hash bucket is empty after nbH
        // insertions is:
        //    pE     = ((2**nbBits - 1)/(2**nbBits))**nbH
        // so we compute:
        //    ln(pE) = nbH * ln((2**nbBits - 1)/(2**nbBits))
        //           = nbH * ln(1 - 1/2**(nbBits))
        //           = nbH * ln(1 - 2**(-nbBits))
        //           = nbH * ln(1 + -(2**(-nbBits)))
        // This means the probability that any given hash bucket is
        // occupied after nbH insertions is:
        //     pF = 1 - pE
        //     pF = 1 - exp(ln(pE))
        //     pF = -(exp(ln(pE)) - 1)
        //     pF = -expm1(ln(pE))
        // And the expected number of collisions is:
        //     C = m - n + n * pE
        //     C = m - n * (1 - pE)
        //     C = n * (m/n - 1 + pE)
        //     C = n * (m/n - (1 - pE))
        //     C = n * (m/n - pF)
        //     C = n * (m/n - (-expm1(ln(pE))))
        //     C = n * (m/n + expm1(ln(pE)))
        // Since the format of floats/doubles is k*2**n, multiplying by
        // exp2(x) doesn't lose any precision, and this formulation keeps
        // m/n and pF at the same general orders of magnitude, so it tends
        // to have very good precision. At low hash occupancy, pF is too
        // close to m/n for this formula to work well.
        double logpE = (double)nbH  * log1p(-exp2(-nbBits));
        double result = exp2(nbBits) * (exp2(-nbBits) * (double)nbH + expm1(logpE));
    
        return result;
    }
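
If you want to compare them yourself, a quick driver along these lines works (a sketch, not part of smhasher; it assumes the estimators above are in the same file):

    #include <math.h>
    #include <stdio.h>
    
    int main(void)
    {
        // Case 1: 77163 keys into 2**32 bins -- the classic ~50% birthday
        // threshold, where the expected collision count is ln 2 ~= 0.693.
        // All of the quadratic approximations agree in this sparse regime.
        // Case 2: 2**24 keys into 2**24 bins (100% load). The true value is
        // ~n/e ~= 6.17e6, but the quadratic formulas drift toward n/2
        // ~= 8.39e6; only the occupancy-based estimate stays close.
        const double cases[][2] = { { 77163.0, 32.0 }, { 16777216.0, 24.0 } };
        for (int i = 0; i < 2; i++) {
            const double nbH = cases[i][0], nbBits = cases[i][1];
            printf("nbH=%.0f bits=%.0f: demerphq=%g previmpl=%g fwojcik=%g\n",
                   nbH, nbBits,
                   EstimateNbCollisions_Demerphq(nbH, nbBits),
                   EstimateNbCollisions_previmpl(nbH, nbBits),
                   EstimateNbCollisions_fwojcik(nbH, (int)nbBits));
        }
        return 0;
    }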

johnisgood:
So which is the best and why?

rurban:
The last one, by fwojcik, because it's the most accurate for our sizes, and also pretty fast.
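
One concrete way to see it (a quick sketch, not from the smhasher docs): past 53 bits a double can't even represent (2**b - 1)/2**b, so the textbook pow()-based occupancy estimate collapses, while the log1p/expm1 formulation from above keeps working:

    #include <math.h>
    #include <stdio.h>
    
    int main(void)
    {
        const double nbBits = 64.0, nbH = 1e9;
        // Textbook form: balls - bins*(1 - ((bins-1)/bins)**balls).
        // Here 1.0 - exp2(-64) rounds to exactly 1.0 and pow(1.0, nbH) == 1.0,
        // so this degenerates to nbH -- it claims every single key collided.
        double naive = nbH - exp2(nbBits) * (1.0 - pow(1.0 - exp2(-nbBits), nbH));
        // fwojcik's formulation: log1p(-2**-64) keeps the tiny probability.
        double logpE = nbH * log1p(-exp2(-nbBits));
        double good  = exp2(nbBits) * (exp2(-nbBits) * nbH + expm1(logpE));
        printf("pow-based:   %g\n", naive);  // 1e+09
        printf("expm1-based: %g\n", good);   // ~0.027 (= nbH^2 / 2^65)
        return 0;
    }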

johnisgood:
Thanks.

By the way, this is extremely off-topic, but would you mind correcting me if I'm wrong anywhere here: https://news.ycombinator.com/item?id=44359539?

I probably should have sent an e-mail.