diff --git a/src/FSharp.Stats/Testing/ChiSquareTest.fs b/src/FSharp.Stats/Testing/ChiSquareTest.fs index a701711d..324dd023 100644 --- a/src/FSharp.Stats/Testing/ChiSquareTest.fs +++ b/src/FSharp.Stats/Testing/ChiSquareTest.fs @@ -37,20 +37,62 @@ open FSharp.Stats type ChiSquareTest = - /// Computes the Chi-Square test - /// n data points -> degrees of freedom = n - 1 + /// + /// Computes the Chi-Square goodness-of-fit test. + /// n data points -> degrees of freedom = n - 1 + /// static member compute (degreesOfFreedom:int) (expected:seq) (observed:seq) = - //let chechParams = - // if expected |> Seq.exists (fun x -> abs x < 5.) then printfn "Warning: A value less than 5 is present in expected values. Results may not be correct!" - // let sumEx = Seq.sum expected - // let sumOb = Seq.sum observed - // if Math.Round(sumEx,1) <> Math.Round(sumOb,1) then printfn "Warning: The sum of observed values does not match the sum of expected values. SumEx: %.3f SumOb: %.3f" sumEx sumOb let chi2 = Seq.zip observed expected |> Seq.fold (fun acc (obs,exp) -> let d = obs - exp acc + (d * d) / exp) 0.0 - + TestStatistics.createChiSquare chi2 (float degreesOfFreedom) + + /// + /// Computes the Chi-Square goodness-of-fit test with Yates's continuity correction. + /// + /// + /// Yates's correction subtracts 0.5 from each |observed - expected| term before squaring. + /// It is recommended when the degrees of freedom equal 1 (two categories) and expected + /// cell counts are small. For df > 1 or large samples the uncorrected compute is + /// preferable. + /// + /// Reference: Yates, F. (1934). Contingency tables involving small numbers and the chi-squared + /// test. Supplement to the Journal of the Royal Statistical Society, 1(2), 217-235. + /// + static member computeWithYates (degreesOfFreedom:int) (expected:seq) (observed:seq) = + let chi2 = + Seq.zip observed expected + |> Seq.fold (fun acc (obs,exp) -> + let diff = abs (obs - exp) - 0.5 + acc + (diff * diff) / exp) 0.0 + TestStatistics.createChiSquare chi2 (float degreesOfFreedom) + + /// + /// Computes the Chi-Square goodness-of-fit test with Williams's correction. + /// + /// + /// Williams's correction divides the chi-square statistic by + /// q = 1 + (k^2 - 1) / (6 * n * k), where k is the number of categories and + /// n is the total observed count. This provides a better approximation to the + /// chi-squared distribution when sample sizes are small. + /// + /// Reference: Williams, D. A. (1976). Improved likelihood ratio tests for complete + /// contingency tables. Biometrika, 63(1), 33-37. + /// + static member computeWithWilliams (degreesOfFreedom:int) (expected:seq) (observed:seq) = + let observedArr = Seq.toArray observed + let expectedArr = Seq.toArray expected + let k = float observedArr.Length + let n = Array.sum observedArr + let q = 1.0 + (k * k - 1.0) / (6.0 * n * k) + let chi2Raw = + Array.zip observedArr expectedArr + |> Array.fold (fun acc (obs,exp) -> + let d = obs - exp + acc + (d * d) / exp) 0.0 + let chi2 = chi2Raw / q TestStatistics.createChiSquare chi2 (float degreesOfFreedom) static member pearsonChiSquared (table:ContingencyTable<_,_>) = diff --git a/tests/FSharp.Stats.Tests/Testing.fs b/tests/FSharp.Stats.Tests/Testing.fs index 1adfd493..083c3f6b 100644 --- a/tests/FSharp.Stats.Tests/Testing.fs +++ b/tests/FSharp.Stats.Tests/Testing.fs @@ -384,12 +384,49 @@ let chiSquaredTests = let df = expected.Length - 1 ChiSquareTest.compute df expected observed + // computeWithYates: + // R: obs <- c(45, 55); chisq.test(obs, p=c(0.5,0.5), correct=TRUE) + // Chi-squared = 0.81, p-value = 0.3681 + let testCaseYates1 = + let expected = [50.0; 50.0] + let observed = [45.0; 55.0] + ChiSquareTest.computeWithYates 1 expected observed + + // R: obs <- c(10, 20); chisq.test(obs, p=c(0.5,0.5), correct=TRUE) + // Chi-squared = 2.7, p-value = 0.1003 + let testCaseYates2 = + let expected = [15.0; 15.0] + let observed = [10.0; 20.0] + ChiSquareTest.computeWithYates 1 expected observed + + // computeWithWilliams: + // Williams q = 1 + (k^2-1)/(6*n*k); k=4, n=556 => q ≈ 1.001124 + // chi2_williams = 0.4700 / 1.001124 ≈ 0.4695 + let testCaseWilliams1 = + let expected = [312.75;104.25;104.25;34.75] + let observed = [315.;101.;108.;32.] + let df = expected.Length - 1 + ChiSquareTest.computeWithWilliams df expected observed + + // k=3, n=45 => q ≈ 1.009877; raw chi2 ≈ 3.3333 => williams ≈ 3.3007 + let testCaseWilliams2 = + let expected = [15.0; 15.0; 15.0] + let observed = [10.0; 20.0; 15.0] + let df = expected.Length - 1 + ChiSquareTest.computeWithWilliams df expected observed + testList "Testing.ChiSquaredTest" [ testCase "compute" <| fun () -> Expect.isTrue (0.9254 = Math.Round(testCase1.PValueRight,4)) "pValue should be equal." Expect.isTrue (0.4700 = Math.Round(testCase1.Statistic,4)) "statistic should be equal." Expect.isTrue (0.000638 = Math.Round(testCase2.PValueRight,6)) "pValue should be equal." Expect.isTrue (19.461 = Math.Round(testCase2.Statistic,3)) "statistic should be equal." + testCase "computeWithYates" <| fun () -> + Expect.floatClose Accuracy.medium testCaseYates1.Statistic 0.81 "Yates statistic should be 0.81" + Expect.floatClose Accuracy.medium testCaseYates2.Statistic 2.7 "Yates statistic should be 2.7" + testCase "computeWithWilliams" <| fun () -> + Expect.floatClose Accuracy.medium testCaseWilliams1.Statistic 0.469496 "Williams statistic should be ~0.4695" + Expect.floatClose Accuracy.medium testCaseWilliams2.Statistic 3.300733 "Williams statistic should be ~3.3007" ]