vp314 · etontackett · Mar 20, 2026 · Mar 20, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/Project.toml b/Project.toml
@@ -7,6 +7,7 @@ authors = ["Eton Tackett <etont@icloud.com>", "Vivak Patel <vp314@users.noreply.
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [compat]
 CSV = "0.10.15"

diff --git a/docs/src/design.md b/docs/src/design.md
diff --git a/src/RidgeRegression.jl b/src/RidgeRegression.jl
@@ -1,5 +1,12 @@
 module RidgeRegression
 
-# Write your package code here.
+using CSV  # CSV.File, used by load_csv_dataset
+using DataFrames  # DataFrame, nrow, names, select, dropmissing
+using Downloads  # Downloads.download, used for URL inputs
+using LinearAlgebra  # NOTE(review): no use visible in units.jl — confirm it is needed
+
+include("units.jl")  # defines Dataset, one_hot_encode, load_csv_dataset
+
+export Dataset, load_csv_dataset, one_hot_encode  # public API of the package
 
 end
diff --git a/src/units.jl b/src/units.jl
@@ -0,0 +1,145 @@
+"""
+    Dataset{TX<:AbstractMatrix, TY<:AbstractVector}
+
+Named regression problem: a design matrix paired with its response vector.
+
+# Description
+
+A `Dataset` bundles the design matrix ``X`` and the response vector ``y`` of a
+regression problem under a human-readable name. Datasets are the experimental
+units on which ridge regression models are evaluated.
+
+# Fields
+- `name::String`: Name of the dataset.
+- `X::TX`: Matrix of features, with one row per entry of `y`.
+- `y::TY`: Response (target) vector.
+
+# Constructor
+
+    Dataset(name::String, X::AbstractMatrix, y::AbstractVector)
+
+## Arguments
+- `name::String`: Name of the dataset.
+- `X::AbstractMatrix`: Feature matrix, stored as given (not copied or converted).
+- `y::AbstractVector`: Response vector, stored as given.
+
+## Returns
+- A `Dataset{typeof(X), typeof(y)}` wrapping `name`, `X` and `y`.
+
+## Throws
+- `ArgumentError`: If `size(X, 1) != length(y)`.
+
+!!! note
+    The type parameters `TX` and `TY` record the concrete array types, so
+    dense and sparse matrices alike are stored without conversion to a
+    dense `Matrix{Float64}`.
+"""
+struct Dataset{TX<:AbstractMatrix, TY<:AbstractVector}
+    name::String
+    X::TX
+    y::TY
+
+    function Dataset(name::String, X::TX, y::TY) where {TX<:AbstractMatrix, TY<:AbstractVector}
+        if size(X, 1) != length(y)
+            throw(ArgumentError("X and y must have same number of rows"))
+        end
+        return new{TX, TY}(name, X, y)
+    end
+end
+
+"""
+    one_hot_encode(Xdf::DataFrame; cols_to_encode, drop_first=true)
+
+Build a dense `Float64` design matrix from the columns of `Xdf`.
+
+The first output column is an intercept column of ones. The columns of `Xdf`
+follow in their original order: a column selected by `cols_to_encode` becomes
+0/1 indicator columns (one per distinct value, in order of first appearance,
+compared by `string` form); any other column must be numeric and is copied.
+
+# Arguments
+- `Xdf::DataFrame`: Feature columns; every column contributes to the output.
+
+# Keyword Arguments
+- `cols_to_encode`: Required. Column names or integer positions in `Xdf` to
+  one-hot encode. Names that do not occur in `Xdf` are ignored.
+- `drop_first::Bool=true`: If `true`, omit the indicator of the first level of
+  each encoded column with more than one level, to avoid multicollinearity.
+
+# Returns
+- `::Matrix{Float64}`: `nrow(Xdf)` rows; intercept, then features as above.
+
+# Throws
+- `ArgumentError`: If a column outside `cols_to_encode` is not `<:Real`.
+"""
+function one_hot_encode(Xdf::DataFrame; cols_to_encode, drop_first::Bool = true)::Matrix{Float64}
+    n = nrow(Xdf)
+    # Normalize names and integer positions to `Symbol`s for membership tests.
+    encode_names = Set{Symbol}(
+        c isa Integer ? Symbol(names(Xdf)[c]) : Symbol(c) for c in cols_to_encode
+    )
+
+    cols = Vector{Float64}[ones(Float64, n)]  # intercept column always comes first
+    for name in names(Xdf)
+        col = Xdf[!, name]
+        if Symbol(name) in encode_names
+            labels = string.(col)  # levels are compared by their string form
+            levels = unique(labels)  # order of first appearance
+            # Skip the first level only when at least one other level remains.
+            start = drop_first && length(levels) > 1 ? 2 : 1
+            for level in levels[start:end]
+                push!(cols, Float64.(labels .== level))
+            end
+        else
+            eltype(col) <: Real ||
+                throw(ArgumentError("Column $name must be numeric unless it is listed in cols_to_encode"))
+            push!(cols, Float64.(col))
+        end
+    end
+
+    # Assemble the collected columns; X already has the declared return type.
+    X = Matrix{Float64}(undef, n, length(cols))
+    for (j, c) in enumerate(cols)
+        X[:, j] = c
+    end
+    return X
+end
+"""
+    load_csv_dataset(path_or_url; target_col, cols_to_encode=Symbol[], name="csv_dataset")
+
+Load a regression dataset from a local CSV file or an HTTP(S) URL.
+
+Rows containing any missing value are dropped. The target column is removed
+from the features, which are then passed to [`one_hot_encode`](@ref) with
+`drop_first = true`; the response is converted to `Vector{Float64}`.
+
+# Arguments
+- `path_or_url::String`: Local file path, or a URL starting with `http://` or
+  `https://` (case-insensitive), which is downloaded to a temporary file first.
+
+# Keyword Arguments
+- `target_col`: Required. Column index or column name of the response variable.
+- `cols_to_encode=Symbol[]`: Column names, or indices into the feature columns
+  (i.e. after `target_col` is removed), to one-hot encode.
+- `name::String="csv_dataset"`: Dataset name.
+
+# Returns
+- `Dataset`: The encoded feature matrix `X`, response vector `y`, and `name`.
+"""
+function load_csv_dataset(path_or_url::String; cols_to_encode=Symbol[], target_col, name::String = "csv_dataset")
+    # Match a real URL scheme: a bare "http" prefix would also catch local
+    # paths such as "http_logs.csv" and try to download them.
+    is_url = occursin(r"^https?://"i, path_or_url)
+    filepath = is_url ? Downloads.download(path_or_url) : path_or_url
+
+    df = dropmissing(DataFrame(CSV.File(filepath)))  # drop rows with any missing value
+    Xdf = select(df, DataFrames.Not(target_col))  # every column except the response
+
+    # DataFrames indexes a column by position or by name alike.
+    y = df[!, target_col]
+
+    # one_hot_encode resolves names and positions in `cols_to_encode` itself.
+    X = one_hot_encode(Xdf; cols_to_encode = cols_to_encode, drop_first = true)
+
+    return Dataset(name, X, collect(Float64, y))
+end
diff --git a/test/Project.toml b/test/Project.toml
@@ -2,4 +2,8 @@
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+
+[compat]
+CSV = "0.10"
+DataFrames = "1"
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,6 +1,20 @@
 using RidgeRegression
 using Test
+using DataFrames
+using LinearAlgebra
+using CSV
 
 @testset "RidgeRegression.jl" begin
-    # Write your tests here.
+    @testset "Dataset Tests" begin
+        include("src/units/units_dataset_tests.jl")  # constructor: stored fields, row-count check
+    end
+
+    @testset "One-Hot Encoding Tests" begin
+        include("src/units/units_encoding_tests.jl")  # one_hot_encode: encoding and error cases
+    end
+
+    @testset "Load CSV Dataset Tests" begin
+        include("src/units/units_load_csv_dataset_tests.jl")  # load_csv_dataset on temp CSV files
+    end
+
 end
diff --git a/test/src/units/units_dataset_tests.jl b/test/src/units/units_dataset_tests.jl
@@ -0,0 +1,19 @@
+@testset "Dataset constructor stores fields correctly" begin
+    features = [1 2; 3 4]
+    response = [10, 20]
+    ds = Dataset("toy", features, response)
+    # Fields come back exactly as they were passed in.
+    @test ds.name == "toy"
+    @test ds.X == features
+    @test ds.y == response
+    @test size(ds.X) == (2, 2)
+    @test length(ds.y) == 2
+    @test ds.X[1, 1] == 1.0
+    @test ds.y[2] == 20.0
+end
+
+@testset "Dataset constructor throws error for mismatched dimensions" begin
+    two_rows = [1 2; 3 4]
+    three_targets = [1, 2, 3]  # one entry more than `two_rows` has rows
+    @test_throws ArgumentError Dataset("bad", two_rows, three_targets)
+end
diff --git a/test/src/units/units_encoding_tests.jl b/test/src/units/units_encoding_tests.jl
@@ -0,0 +1,38 @@
+@testset "one_hot_encode encodes specified categorical columns and keeps numeric columns" begin
+    df = DataFrame(
+        A = ["red", "blue", "red", "green"],
+        B = [1, 2, 3, 4],
+        C = ["small", "large", "medium", "small"]
+    )
+    X = one_hot_encode(df; cols_to_encode=[:A, :C], drop_first=true)
+
+    # Layout: intercept | A=blue, A=green | B | C=large, C=medium
+    @test (4, 6) == size(X)
+    @test ones(4) == X[:, 1]
+    @test [0.0 0.0; 1.0 0.0; 0.0 0.0; 0.0 1.0] == X[:, 2:3]
+    @test [1.0, 2.0, 3.0, 4.0] == X[:, 4]
+    @test [0.0 0.0; 1.0 0.0; 0.0 1.0; 0.0 0.0] == X[:, 5:6]
+end
+
+@testset "one_hot_encode throws error for invalid column specifications" begin
+    # Column C holds strings but is left out of `cols_to_encode`, so it is
+    # neither numeric nor encoded.
+    colors = ["red", "blue", "red", "green"]
+    sizes = ["small", "large", "medium", "small"]
+    df = DataFrame(A = colors, B = [1, 2, 3, 4], C = sizes)
+
+    @test_throws ArgumentError one_hot_encode(df; cols_to_encode=[:A], drop_first=true)
+end
+
+@testset "one_hot_encode supports integer-coded categorical columns when specified" begin
+    df = DataFrame(
+        group = [1, 2, 1, 3],
+        x = [10.0, 20.0, 30.0, 40.0]
+    )
+    X = one_hot_encode(df; cols_to_encode=[:group], drop_first=true)
+    # Layout: intercept | group=2, group=3 | x
+    @test (4, 4) == size(X)
+    @test ones(4) == X[:, 1]
+    @test [0.0 0.0; 1.0 0.0; 0.0 0.0; 0.0 1.0] == X[:, 2:3]
+    @test [10.0, 20.0, 30.0, 40.0] == X[:, 4]
+end
diff --git a/test/src/units/units_load_csv_dataset_tests.jl b/test/src/units/units_load_csv_dataset_tests.jl
@@ -0,0 +1,38 @@
+@testset "load_csv_dataset drops missing rows and uses target column" begin
+    tmp = tempname() * ".csv"
+
+    df = DataFrame(
+        a = [1.0, 2.0, missing, 4.0],
+        b = ["x", "y", "y", "x"],
+        y = [10.0, 20.0, 30.0, 40.0]
+    )
+
+    CSV.write(tmp, df)
+
+    d = load_csv_dataset(tmp; target_col=:y, cols_to_encode=[:b], name="tmp")
+
+    @test "tmp" == d.name
+    @test 3 == length(d.y)
+    @test [10.0, 20.0, 40.0] == d.y
+    # Row 3 (missing `a`) is dropped; columns are intercept | a | b=y.
+    @test [1.0 1.0 0.0; 1.0 2.0 1.0; 1.0 4.0 0.0] == d.X
+end
+
+@testset "load_csv_dataset drops missing rows and uses target column by index" begin
+    tmp = tempname() * ".csv"
+
+    df = DataFrame(
+        a = [1.0, 2.0, missing, 4.0],
+        b = ["x", "y", "y", "x"],
+        y = [10.0, 20.0, 30.0, 40.0]
+    )
+
+    CSV.write(tmp, df)
+
+    d = load_csv_dataset(tmp; target_col=3, cols_to_encode=[:b], name="tmp2")
+
+    @test "tmp2" == d.name
+    @test [10.0, 20.0, 40.0] == d.y
+    # Row 3 (missing `a`) is dropped; columns are intercept | a | b=y.
+    @test [1.0 1.0 0.0; 1.0 2.0 1.0; 1.0 4.0 0.0] == d.X
+end