@@ -3,6 +3,8 @@ package is.hail
33import is .hail .types .physical .{PCanonicalStruct , PFloat64 }
44import is .hail .utils ._
55
6+ import scala .annotation .tailrec
7+
68import net .sourceforge .jdistlib .{Beta , ChiSquare , Gamma , NonCentralChiSquare , Normal , Poisson }
79import net .sourceforge .jdistlib .disttest .{DistributionTest , TestKind }
810import org .apache .commons .math3 .distribution .HypergeometricDistribution
@@ -162,45 +164,28 @@ package object stats {
162164 )
163165
164166 def fisherExactTest (a : Int , b : Int , c : Int , d : Int ): Array [Double ] =
165- fisherExactTest(a, b, c, d, 1.0 , 0.95 , " two.sided" )
166-
167- def fisherExactTest (
168- a : Int ,
169- b : Int ,
170- c : Int ,
171- d : Int ,
172- oddsRatio : Double = 1d ,
173- confidenceLevel : Double = 0.95 ,
174- alternative : String = " two.sided" ,
175- ): Array [Double ] = {
167+ fisherExactTest(a, b, c, d, 0.95 )
176168
169+ def fisherExactTest (a : Int , b : Int , c : Int , d : Int , confidenceLevel : Double ): Array [Double ] = {
177170 if (! (a >= 0 && b >= 0 && c >= 0 && d >= 0 ))
178171 fatal(s " fisher_exact_test: all arguments must be non-negative, got $a, $b, $c, $d" )
179172
180173 if (confidenceLevel < 0d || confidenceLevel > 1d )
181174 fatal(" Confidence level must be between 0 and 1" )
182175
183- if (oddsRatio < 0d )
184- fatal(" Odds ratio must be non-negative" )
185-
186- if (alternative != " greater" && alternative != " less" && alternative != " two.sided" )
187- fatal(" Did not recognize test type string. Use one of greater, less, two.sided" )
188-
189176 val popSize = a + b + c + d
190- val numSuccessPopulation = a + c
191- val sampleSize = a + b
177+ val nGood = a + c
178+ val nSample = a + b
192179 val numSuccessSample = a
193180
194- if (
195- ! (popSize > 0 && sampleSize > 0 && sampleSize < popSize && numSuccessPopulation > 0 && numSuccessPopulation < popSize)
196- )
181+ if (! (popSize > 0 && nSample > 0 && nSample < popSize && nGood > 0 && nGood < popSize))
197182 return Array (Double .NaN , Double .NaN , Double .NaN , Double .NaN )
198183
199184 val low = math.max(0 , (a + b) - (b + d))
200185 val high = math.min(a + b, a + c)
201186 val support = (low to high).toArray
202187
203- val hgd = new HypergeometricDistribution (null , popSize, numSuccessPopulation, sampleSize )
188+ val hgd = new HypergeometricDistribution (null , popSize, nGood, nSample )
204189 val epsilon = 2.220446e-16
205190
206191 def dhyper (k : Int , logProb : Boolean ): Double =
@@ -320,36 +305,74 @@ package object stats {
320305 }
321306 }
322307
323- val pvalue : Double = (alternative : @ unchecked) match {
324- case " less" => pnhyper(numSuccessSample, oddsRatio)
325- case " greater" => pnhyper(numSuccessSample, oddsRatio, upper_tail = true )
326- case " two.sided" =>
327- if (oddsRatio == 0 )
328- if (low == numSuccessSample) 1d else 0d
329- else if (oddsRatio == Double .PositiveInfinity )
330- if (high == numSuccessSample) 1d else 0d
331- else {
332- val relErr = 1d + 1e-7
333- val d = dnhyper(oddsRatio)
334- d.filter(_ <= d(numSuccessSample - low) * relErr).sum
335- }
336- }
337-
338- assert(pvalue >= 0d && pvalue <= 1.000000000002 )
308+ val pvalue = fisherExactTestPValueOnly(a, b, c, d)
339309
340310 val oddsRatioEstimate = mle(numSuccessSample.toDouble)
341311
342- val confInterval = alternative match {
343- case " less" => (0d , ncpUpper(numSuccessSample, 1 - confidenceLevel))
344- case " greater" => (ncpLower(numSuccessSample, 1 - confidenceLevel), Double .PositiveInfinity )
345- case " two.sided" =>
346- val alpha = (1 - confidenceLevel) / 2d
347- (ncpLower(numSuccessSample, alpha), ncpUpper(numSuccessSample, alpha))
312+ val confInterval = {
313+ val alpha = (1 - confidenceLevel) / 2d
314+ (ncpLower(numSuccessSample, alpha), ncpUpper(numSuccessSample, alpha))
348315 }
349316
350317 Array (pvalue, oddsRatioEstimate, confInterval._1, confInterval._2)
351318 }
352319
320+ def fisherExactTestPValueOnly (a : Int , b : Int , c : Int , d : Int ): Double = {
321+ val popSize = a + b + c + d
322+ val nGood = a + c
323+ val nSample = a + b
324+ val numSuccessSample = a
325+
326+ if (! (a >= 0 && b >= 0 && c >= 0 && d >= 0 ))
327+ fatal(s " fisher_exact_test: all arguments must be non-negative, got $a, $b, $c, $d" )
328+
329+ if (! (popSize > 0 && nSample > 0 && nSample < popSize && nGood > 0 && nGood < popSize))
330+ return Double .NaN
331+
332+ val hgd = new HypergeometricDistribution (null , popSize, nGood, nSample)
333+
334+ // Returns i in [start, end] such that a([start, i)) is <= d, and a([i, end)) is > d
335+ @ tailrec def upperBoundIncreasing (a : Int => Double , d : Double , start : Int , end : Int ): Int = {
336+ if (start >= end) return start
337+ val mid = (start + end) >>> 1
338+ val elt = a(mid)
339+ if (elt <= d) upperBoundIncreasing(a, d, mid + 1 , end)
340+ else upperBoundIncreasing(a, d, start, mid)
341+ }
342+
343+ // Returns i in [start, end] such that a([start, i)) is > d, and a([i, end)) is <= d
344+ @ tailrec def lowerBoundDecreasing (a : Int => Double , d : Double , start : Int , end : Int ): Int = {
345+ if (start >= end) return start
346+ val mid = (start + end) >>> 1
347+ val elt = a(mid)
348+ if (elt > d) lowerBoundDecreasing(a, d, mid + 1 , end)
349+ else lowerBoundDecreasing(a, d, start, mid)
350+ }
351+
352+ val epsilon = 1e-14
353+ val gamma = 1 + epsilon
354+
355+ val mode = ((nSample + 1.0 ) * (nGood + 1.0 ) / (popSize + 2.0 )).toInt
356+ val pexact = hgd.probability(numSuccessSample)
357+ val pmode = hgd.probability(mode)
358+
359+ val pvalue = if (math.abs(pexact - pmode) / math.max(pexact, pmode) <= epsilon) {
360+ 1.0
361+ } else if (numSuccessSample < mode) {
362+ val plower = hgd.cumulativeProbability(numSuccessSample)
363+ val bound = lowerBoundDecreasing(hgd.probability, pexact * gamma, mode + 1 , nSample + 1 )
364+ plower + hgd.upperCumulativeProbability(bound)
365+ } else {
366+ val pupper = hgd.upperCumulativeProbability(numSuccessSample)
367+ val bound = upperBoundIncreasing(hgd.probability, pexact * gamma, 0 , mode)
368+ pupper + hgd.cumulativeProbability(bound - 1 )
369+ }
370+
371+ assert(pvalue >= 0d && pvalue <= 1.000000000002 )
372+
373+ pvalue
374+ }
375+
353376 def dnorm (x : Double , mu : Double , sigma : Double , logP : Boolean ): Double =
354377 Normal .density(x, mu, sigma, logP)
355378
0 commit comments