diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d7d5180..d1e78c0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,70 +1,35 @@ name: CI on: pull_request: - branches: - - master push: - branches: - - master - tags: '*' + branches: [master] + tags: ['*'] workflow_dispatch: jobs: test: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} - continue-on-error: ${{ matrix.version == 'nightly' }} strategy: fail-fast: false matrix: version: - '1.10' - '1' - # - 'nightly' os: - ubuntu-latest arch: - x64 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: actions/cache@v4 - env: - cache-name: cache-artifacts - with: - path: ~/.julia/artifacts - key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} - restore-keys: | - ${{ runner.os }}-test-${{ env.cache-name }}- - ${{ runner.os }}-test- - ${{ runner.os }}- + - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v5 with: files: lcov.info -# docs: -# name: Documentation -# runs-on: ubuntu-latest -# steps: -# - uses: actions/checkout@v4 -# - uses: julia-actions/setup-julia@v2 -# with: -# version: '1' -# - run: | -# julia --project=docs -e ' -# using Pkg -# Pkg.develop(PackageSpec(path=pwd())) -# Pkg.instantiate()' -# - run: | -# julia --project=docs -e ' -# using Documenter: doctest -# using ForwardDiff -# doctest(ForwardDiff)' -# - run: julia --project=docs docs/make.jl -# env: -# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -# DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/Project.toml b/Project.toml index 4760c76..108f072 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "FixedEffectModels" uuid = "9d5cd8c9-2029-5cab-9928-427838db53e3" -version = "1.13.0" +version = "1.13.1" [deps] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" diff --git a/README.md b/README.md index 6808562..cb799f4 100755 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ formula(m::FixedEffectModel) predict(m::FixedEffectModel, df) residuals(m::FixedEffectModel, df) ``` -Note that the functions `predict` and `residuals` require a table (`df`) as a second argument because the object returned by `reg` does not store the original dataset (to keep the model lightweight). For background on the tradeoff of storing the original data inside fitted model objects, see [1](http://www.r-bloggers.com/trimming-the-fat-from-glm-models-in-r/), [2](https://blogs.oracle.com/R/entry/is_the_size_of_your), [3](http://stackoverflow.com/questions/21896265/how-to-minimize-size-of-object-of-class-lm-without-compromising-it-being-passe), [4](http://stackoverflow.com/questions/15260429/is-there-a-way-to-compress-an-lm-class-for-later-prediction), [5](http://stackoverflow.com/questions/26010742/using-stargazer-with-memory-greedy-glm-objects), and [6](http://stackoverflow.com/questions/22577161/not-enough-ram-to-run-stargazer-the-normal-way). +Note that the functions `predict` and `residuals` require a table (`df`) as a second argument because the object returned by `reg` does not store the original dataset (to keep the model lightweight). For models with fixed effects, `predict` additionally requires estimating the model with `save = :fe` (or `:all`), and in-sample residual retrieval requires `save = :residuals` (or `:all`). For background on the tradeoff of storing the original data inside fitted model objects, see [1](http://www.r-bloggers.com/trimming-the-fat-from-glm-models-in-r/), [2](https://blogs.oracle.com/R/entry/is_the_size_of_your), [3](http://stackoverflow.com/questions/21896265/how-to-minimize-size-of-object-of-class-lm-without-compromising-it-being-passe), [4](http://stackoverflow.com/questions/15260429/is-there-a-way-to-compress-an-lm-class-for-later-prediction), [5](http://stackoverflow.com/questions/26010742/using-stargazer-with-memory-greedy-glm-objects), and [6](http://stackoverflow.com/questions/22577161/not-enough-ram-to-run-stargazer-the-normal-way). Finally, you can use [RegressionTables.jl](https://github.com/jmboehm/RegressionTables.jl) to get publication-quality regression tables. diff --git a/src/FixedEffectModel.jl b/src/FixedEffectModel.jl index 3bfce84..3403787 100644 --- a/src/FixedEffectModel.jl +++ b/src/FixedEffectModel.jl @@ -113,24 +113,23 @@ end # predict, residuals, modelresponse -# Utility functions for checking whether FE/continuous interactions are in formula -# These are currently not supported in predict -function is_cont_fe_int(x) +# Utility functions for checking whether FE/non-FE interactions are in formula. +# These are currently not supported in predict. +function is_cont_fe_int(x) x isa InteractionTerm || return false - any(x -> isa(x, Term), x.terms) && any(x -> isa(x, FunctionTerm{typeof(fe), Vector{Term}}), x.terms) + any(has_fe, x.terms) && any(t -> !has_fe(t), x.terms) end -# Does the formula have InteractionTerms? +has_cont_fe_interaction(::AbstractTerm) = false +has_cont_fe_interaction(x::InteractionTerm) = is_cont_fe_int(x) +has_cont_fe_interaction(x::NTuple{N, AbstractTerm}) where {N} = any(has_cont_fe_interaction, x) function has_cont_fe_interaction(x::FormulaTerm) - if x.rhs isa AbstractTerm # only one term - is_cont_fe_int(x) - elseif hasfield(typeof(x.rhs), :lhs) # Is an IV term - false # Is this correct? - else - any(is_cont_fe_int, x.rhs) - end + has_cont_fe_interaction(x.lhs) || any(has_cont_fe_interaction, eachterm(x.rhs)) end +_is_no_regressor_rhs(rhs) = rhs == MatrixTerm((InterceptTerm{false}(),)) +_is_intercept_only_rhs(rhs) = rhs == MatrixTerm((InterceptTerm{true}(),)) + function StatsAPI.predict(m::FixedEffectModel, data) Tables.istable(data) || throw(ArgumentError("Expected second argument to be a Table, got $(typeof(data))")) @@ -140,11 +139,13 @@ function StatsAPI.predict(m::FixedEffectModel, data) # only fixed effects cdata = StatsModels.columntable(data) - nrows = length(Tables.rows(cdata)) - if m.formula_schema.rhs == MatrixTerm((InterceptTerm{false}(),)) - has_fe(m) || throw(ArgumentError("To be used with predict, a model requires regressors or fixed effects")) + nrows = length(Tables.rows(data)) + if _is_no_regressor_rhs(m.formula_schema.rhs) out = zeros(Float64, nrows) nonmissings = trues(nrows) + elseif _is_intercept_only_rhs(m.formula_schema.rhs) + out = fill(only(m.coef), nrows) + nonmissings = trues(nrows) else cols, nonmissings = StatsModels.missing_omit(cdata, m.formula_schema.rhs) Xnew = modelmatrix(m.formula_schema, cols) @@ -186,14 +187,26 @@ function StatsAPI.residuals(m::FixedEffectModel, data) has_fe(m) && throw(ArgumentError("To access residuals for a model with high-dimensional fixed effects, run `m = reg(..., save = :residuals)` and then access residuals with `residuals(m)`.")) cdata = StatsModels.columntable(data) - cols, nonmissings = StatsModels.missing_omit(cdata, m.formula_schema.rhs) - Xnew = modelmatrix(m.formula_schema, cols) y = response(m.formula_schema, cdata) - if all(nonmissings) - out = y - Xnew * m.coef + if _is_no_regressor_rhs(m.formula_schema.rhs) + out = y + elseif _is_intercept_only_rhs(m.formula_schema.rhs) + observed = .!ismissing.(y) + if all(observed) + out = y .- only(m.coef) + else + out = Vector{Union{Float64, Missing}}(missing, length(y)) + out[observed] = y[observed] .- only(m.coef) + end else - out = Vector{Union{Float64, Missing}}(missing, length(Tables.rows(cdata))) - out[nonmissings] = y - Xnew * m.coef + cols, nonmissings = StatsModels.missing_omit(cdata, m.formula_schema.rhs) + Xnew = modelmatrix(m.formula_schema, cols) + if all(nonmissings) + out = y - Xnew * m.coef + else + out = Vector{Union{Float64, Missing}}(missing, length(Tables.rows(data))) + out[nonmissings] = y - Xnew * m.coef + end end return out end @@ -361,4 +374,3 @@ function StatsModels.apply_schema(t::FormulaTerm, schema::StatsModels.Schema, Mo FormulaTerm(apply_schema(t.lhs, schema.schema, StatisticalModel), StatsModels.collect_matrix_terms(apply_schema(t.rhs, schema, StatisticalModel))) end - diff --git a/src/fit.jl b/src/fit.jl index fe44399..d7790a3 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -60,7 +60,7 @@ function reg(df, progress_bar::Bool = true, subset::Union{Nothing, AbstractVector} = nothing, first_stage::Bool = true) - StatsAPI.fit(FixedEffectModel, formula, df, vcov; contrasts = contrasts, weights = weights, save = save, method = method, double_precision = double_precision, tol = tol, maxiter = maxiter, drop_singletons = drop_singletons, progress_bar = progress_bar, subset = subset, first_stage = first_stage) + StatsAPI.fit(FixedEffectModel, formula, df, vcov; contrasts = contrasts, weights = weights, save = save, method = method, nthreads = nthreads, double_precision = double_precision, tol = tol, maxiter = maxiter, drop_singletons = drop_singletons, progress_bar = progress_bar, subset = subset, first_stage = first_stage) end function StatsAPI.fit(::Type{FixedEffectModel}, diff --git a/src/partial_out.jl b/src/partial_out.jl index 09caeec..c1d7f98 100644 --- a/src/partial_out.jl +++ b/src/partial_out.jl @@ -42,6 +42,10 @@ function partial_out( @nospecialize(tol::Real = double_precision ? 1e-8 : 1e-6), @nospecialize(align = true)) + # Normalize generic Tables.jl input once; the rest of the function uses + # DataFrame indexing/views, and copycols = false avoids copies when possible. + df = DataFrame(df; copycols = false) + if (ConstantTerm(0) ∉ eachterm(f.rhs)) && (ConstantTerm(1) ∉ eachterm(f.rhs)) f = FormulaTerm(f.lhs, tuple(ConstantTerm(1), eachterm(f.rhs)...)) end @@ -53,8 +57,10 @@ function partial_out( # create a dataframe without missing values & negative weights - all_vars = StatsModels.termvars(formula) - esample = completecases(df[!, all_vars]) + main_vars = unique(StatsModels.termvars(formula)) + fe_vars = unique(StatsModels.termvars(formula_fes)) + all_vars = unique(vcat(main_vars, fe_vars)) + esample = completecases(df, all_vars) if has_weights esample .&= BitArray(!ismissing(x) && (x > 0) for x in df[!, weights]) end diff --git a/test/fit.jl b/test/fit.jl index 6e70b3b..d58f7a7 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -750,3 +750,31 @@ end x = reg(df1, @formula(a ~ b + fe(c))) @test coef(x) ≈ [0.5] atol = 1e-4 end + +@testset "keyword arguments" begin + df = DataFrame(y = [1.0, 2.0, 2.5], x = [0.0, 1.0, 2.0]) + @test_logs (:info, r"The keyword argument nthreads is deprecated") reg(df, @formula(y ~ x), nthreads = 1) + @test_logs (:info, r"method = :gpu is deprecated") reg(df, @formula(y ~ x), method = :gpu) +end + +@testset "error handling" begin + df = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, 1.0, 2.0], z = [1.0, 2.0, 3.0], w = [1.0, 2.0, Inf]) + + @test_throws ArgumentError reg(df, @formula(y ~ x), save = :bad) + @test_throws DimensionMismatch reg(df, @formula(y ~ x), subset = [true, false]) + @test_throws ArgumentError reg(df, @formula(y ~ x), subset = falses(3)) + @test_throws ArgumentError adjr2(reg(df, @formula(y ~ x)), :bad) + @test_throws ArgumentError reg(df, @formula(y ~ x), weights = :w) + + df_yinf = DataFrame(y = [1.0, Inf, 3.0], x = [0.0, 1.0, 2.0]) + @test_throws ArgumentError reg(df_yinf, @formula(y ~ x)) + + df_xinf = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, Inf, 2.0]) + @test_throws ArgumentError reg(df_xinf, @formula(y ~ x)) + + df_endoinf = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, Inf, 2.0], z = [1.0, 2.0, 3.0]) + @test_throws ArgumentError reg(df_endoinf, @formula(y ~ (x ~ z))) + + df_ivinf = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, 1.0, 2.0], z = [1.0, Inf, 3.0]) + @test_throws ArgumentError reg(df_ivinf, @formula(y ~ (x ~ z))) +end diff --git a/test/partial_out.jl b/test/partial_out.jl index d8da23f..c1903e2 100644 --- a/test/partial_out.jl +++ b/test/partial_out.jl @@ -22,5 +22,21 @@ using FixedEffectModels, DataFrames, Statistics, CSV, Test @test Matrix(partial_out(df, @formula(Sales + Price ~ NDI + fe(State)), weights = :Pop)[1])[1:5, :] ≈ [ -22.2429 -1.2635 ; -20.5296 -1.1515 ; -17.2164 -2.23949; -19.1378 -1.45057; -19.854 -2.28819] atol = 1e-3 @test Matrix(partial_out(df, @formula(Sales + Price ~ 1 + fe(State)), weights = :Pop)[1])[1:5, :] ≈ [ -14.0383 -43.1224; -12.5383 -41.9224; -9.43825 -41.9224; -11.5383 -40.2224; -12.4383 -40.1224] atol = 1e-3 @test Matrix(partial_out(df, @formula(Sales + Price ~ 1), weights = :Pop)[1])[1:5, :] ≈ [ -26.3745 -44.9103; -24.8745 -43.7103; -21.7745 -43.7103; -23.8745 -42.0103; -24.7745 -41.9103] atol = 1e-3 + + df_missing = DataFrame(y = [1.0, 2.0, 3.0, 4.0], g = ["a", missing, "a", "b"]) + out_missing, esample_missing, _, _ = partial_out(df_missing, @formula(y ~ 1 + fe(g))) + out_nomissing, esample_nomissing, _, _ = partial_out(dropmissing(df_missing, :g), @formula(y ~ 1 + fe(g)); align = false) + @test esample_missing == [true, false, true, true] + @test all(esample_nomissing) + @test ismissing(out_missing.y[2]) + @test collect(skipmissing(out_missing.y)) ≈ out_nomissing.y +end + +@testset "partial out errors" begin + df = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, 1.0, 2.0], z = [1.0, 2.0, 3.0], w = [1.0, 2.0, Inf]) + + @test_throws ArgumentError partial_out(df, @formula(y ~ (x ~ z))) + @test_throws ArgumentError partial_out(df, @formula(y ~ x), weights = :w) + @test_throws ArgumentError partial_out(DataFrame(y = [missing, missing], x = [1.0, 2.0]), @formula(y ~ x)) end diff --git a/test/predict.jl b/test/predict.jl index bdfa6d0..9f64f63 100644 --- a/test/predict.jl +++ b/test/predict.jl @@ -1,4 +1,5 @@ using FixedEffectModels, DataFrames, CategoricalArrays, CSV, Test +using CUDA, Metal @@ -38,6 +39,17 @@ using FixedEffectModels, DataFrames, CategoricalArrays, CSV, Test end @testset "Predict" begin + # Intercept only + df = DataFrame(y = [1.0, 2.0, 3.0]) + m = reg(df, @formula(y ~ 1)) + pred = predict(m, df) + @test pred ≈ fill(2.0, 3) + + # No regressors + m = reg(df, @formula(y ~ 0)) + pred = predict(m, df) + @test pred ≈ zeros(3) + # Simple - one binary FE df = DataFrame(x = rand(100), g = rand(["a", "b"], 100)) df.y = 1.0 .+ 0.5 .* df.x .+ (df.g .== "b") @@ -155,6 +167,12 @@ end m = reg(df, @formula(y ~ x + fe(g1) + fe(g2)&z + fe(g1)&x); save = :fe) @test_throws ArgumentError pred = predict(m, df) + # FE/continuous interaction as the only regressor + df = DataFrame(g = rand(["a", "b"], 100), z = rand(100)) + df.y = 1.5 .+ 2.0 .* (df.g .== "b") .* df.z + m = reg(df, @formula(y ~ fe(g)&z); save = :fe) + @test_throws ArgumentError pred = predict(m, df) + # Regular FE + interacted FE + FE/continuous interaction df = DataFrame(x = rand(100), g1 = rand(["a", "b"], 100), g2 = rand(["c", "d"], 100), g3 = rand(["e", "f"], 100), g4 = rand(["g", "h"], 100), z = rand(100)) @@ -171,6 +189,20 @@ end @test all(skipmissing(out1 .≈ out2)) end +@testset "predict/residual error paths" begin + df = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, 1.0, 2.0], g = ["a", "a", "b"]) + + @test_throws ArgumentError predict(reg(df, @formula(y ~ x)), 1) + @test_throws ArgumentError residuals(reg(df, @formula(y ~ x)), 1) + @test_throws ArgumentError fe(reg(df, @formula(y ~ x))) + @test_throws ArgumentError residuals(reg(df, @formula(y ~ x))) + + m_fe = reg(df, @formula(y ~ x + fe(g))) + @test_throws ArgumentError predict(m_fe, df) + @test_throws ArgumentError residuals(m_fe, df) + @test_throws ArgumentError residuals(m_fe) +end + @testset "Continuous/FE detection" begin # Regular interaction is fine as handled by StatsModels @test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + y&z)) == false @@ -179,6 +211,7 @@ end @test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + fe(y)&fe(z))) == false # Interaction of FEs with continuous variable requires special handling, currently not implemented + @test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ fe(y)&z)) == true @test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + fe(y)&z)) == true @test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + y&fe(z))) == true @test FixedEffectModels.has_cont_fe_interaction(@formula(y ~ x + fe(y)&fe(z)&a)) == true @@ -190,6 +223,11 @@ end @testset "residuals" begin + df = DataFrame(y = [1.0, 2.0, 3.0]) + model = @formula y ~ 1 + result = reg(df, model) + @test residuals(result, df) ≈ [-1.0, 0.0, 1.0] + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) df.StateC = categorical(df.State) @@ -408,5 +446,3 @@ end @test fe(result)[1, :fe_Year] ≈ 164.7 atol = 1e-1 end end - -