Skip to content
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ authors = ["Eton Tackett <etont@icloud.com>", "Vivak Patel <vp314@users.noreply.
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[compat]
CSV = "0.10.15"
Expand Down
170 changes: 41 additions & 129 deletions docs/src/design.md

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion src/RidgeRegression.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
module RidgeRegression

# Packages used by the source files included below.
using CSV, DataFrames, Downloads, LinearAlgebra

# Experimental-unit code: `Dataset`, `one_hot_encode`, and `load_csv_dataset`.
include("units.jl")

export Dataset, load_csv_dataset, one_hot_encode

end
145 changes: 145 additions & 0 deletions src/units.jl

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All dependencies should appear in the Project.toml file. You should activate the package environment and then "add ..." your dependencies to ensure compatibility and correct environment for the package.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your data struct for the experimental unit should correspond to the design. It should have
\lambda, n and p as fields. While n and p can be computed, either there should be a convenience function to compute them or they should be explicit fields in the unit.

Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""
    Dataset{TX<:AbstractMatrix, TY<:AbstractVector}

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dataset is not a subtype of "ExperimentalUnit" as experimental unit does not exist. This signature should reflect the exact parametric type of Dataset{ }

May want to consider renaming it Unit{ }


A dataset for ridge regression experiments.
Comment thread
etontackett marked this conversation as resolved.

# Description

A `Dataset` object stores the design matrix ``X`` and response vector ``y``
for a regression problem. These datasets serve as the experimental units for ridge regression experiments, allowing us to evaluate the performance of ridge regression models on various datasets.
Comment thread
etontackett marked this conversation as resolved.

# Fields
- `name::String`: Name of dataset
- `X::TX`: Matrix of variables/features
- `y::TY`: Target vector

# Constructor

Dataset(name::String, X::AbstractMatrix, y::AbstractVector)

## Arguments
- `name::String`: Name of dataset
- `X::TX`: Matrix of variables/features
- `y::TY`: Target vector

## Returns
- A `Dataset` object containing the numeric design matrix and response vector.

## Throws
- `ArgumentError`: If the number of rows in `X` does not equal the length of `y`.
Comment thread
etontackett marked this conversation as resolved.

!!! note
`Dataset` objects are used as experimental units when evaluating
ridge regression algorithms. The parametric design allows both dense
and sparse matrices to be stored without forcing conversion to a
dense `Matrix{Float64}`.
"""
struct Dataset{TX<:AbstractMatrix,TY<:AbstractVector}
    # Label identifying the dataset; always stored as a `String`.
    name::String
    # Design matrix; its row count must match `length(y)`.
    X::TX
    # Response vector, one entry per row of `X`.
    y::TY

    # Inner constructor: the only way to build a `Dataset`, so every instance is
    # guaranteed to have matching row counts. Any `AbstractString` name (for
    # example a `SubString`) is accepted and copied into a `String`.
    # Throws `ArgumentError` when `size(X, 1) != length(y)`.
    function Dataset(
        name::AbstractString, X::TX, y::TY
    ) where {TX<:AbstractMatrix,TY<:AbstractVector}
        size(X, 1) == length(y) ||
            throw(ArgumentError("X and y must have same number of rows"))

        return new{TX,TY}(String(name), X, y)
    end
end

"""
    one_hot_encode(Xdf::DataFrame; cols_to_encode, drop_first::Bool=true)

One-hot encode categorical (string-like) features in `Xdf`.

# Arguments
- `Xdf::DataFrame`: Input DataFrame of feature columns. Every column is used, so the response column must be removed before calling.

# Keyword Arguments
- `cols_to_encode`: A collection of column names or indices to one-hot encode.
- `drop_first::Bool=true`: If `true`, drop the first dummy column for
each categorical feature to avoid multicollinearity.
Comment thread
etontackett marked this conversation as resolved.

# Returns
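- `::Matrix{Float64}`: A numeric matrix whose first column is an intercept column of ones, followed by the numeric and one-hot encoded feature columns in the order they appear in `Xdf`.

# Throws
- `ArgumentError`: If a column that is not listed in `cols_to_encode` has a non-numeric element type.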
"""

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need a section in the docstring called "# Throws" to describe the errors being thrown.

function one_hot_encode(Xdf::DataFrame; cols_to_encode, drop_first::Bool = true)::Matrix{Float64}
    # Builds the design matrix column by column: an intercept column of ones,
    # then one block per column of `Xdf` in its original order. Columns selected
    # by `cols_to_encode` become 0/1 indicator columns; all others must be
    # numeric and are converted to Float64.
    #
    # Throws `ArgumentError` when a column that is not selected for encoding has
    # a non-`Real` element type. An out-of-range integer entry in
    # `cols_to_encode` fails when it is looked up in `names(Xdf)`.
    col_names = names(Xdf)

    # Normalise the selection to column-name symbols. Any `Integer` entry is a
    # positional index into `Xdf`; everything else is treated as a name. Names
    # that match no column are simply never hit by the loop below.
    encode_names = Set{Symbol}(
        c isa Integer ? Symbol(col_names[c]) : Symbol(c) for c in cols_to_encode
    )

    n = nrow(Xdf)
    # The intercept column comes first so the model's constant term is column 1.
    cols = Vector{Float64}[ones(Float64, n)]

    for name in col_names
        col = Xdf[!, name]

        if Symbol(name) in encode_names
            # Compare values by their string form so that integer-coded
            # categories are handled the same way as string categories.
            labels = string.(col)
            levels = unique(labels) # levels in order of first appearance

            # Dropping the first level avoids perfect collinearity with the
            # intercept. A single-level column is kept as is.
            if drop_first && length(levels) > 1
                levels = levels[2:end]
            end

            for level in levels
                push!(cols, Float64.(labels .== level))
            end
        else
            eltype(col) <: Real || throw(
                ArgumentError(
                    "Column $name must be numeric unless it is listed in cols_to_encode",
                ),
            )
            push!(cols, Float64.(col))
        end
    end

    # `cols` always holds at least the intercept, so the reduction is never empty.
    return reduce(hcat, cols)
end
"""
    load_csv_dataset(path_or_url::String; cols_to_encode=Symbol[], target_col, name::String="csv_dataset")
Comment thread
etontackett marked this conversation as resolved.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Signatures should include types as you have done previously

Comment thread
etontackett marked this conversation as resolved.

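Load a dataset from a CSV file or URL. Rows containing missing values in any column are dropped before the feature matrix and response vector are built.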

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should indicate that this does preprocessing as it removes rows with missing values.


# Arguments
- `path_or_url::String`: Local file path or web URL containing CSV data.

# Keyword Arguments
- `cols_to_encode=Symbol[]`: Column names or indices in the feature data to one-hot encode.
- `target_col`: Column index or column name containing the response variable.
- `name::String="csv_dataset"`: Dataset name.

# Returns
- `Dataset`: A dataset containing the encoded feature matrix `X`, response vector `y`, and dataset name.
"""
function load_csv_dataset(path_or_url::String; cols_to_encode=Symbol[], target_col, name::String = "csv_dataset")
    # Reads a CSV from a local path or an http(s) URL, drops incomplete rows,
    # splits off the response column, and encodes the remaining columns with
    # `one_hot_encode` (intercept column included, first dummy level dropped).
    #
    # Throws `ArgumentError` (from `one_hot_encode`) when a feature column is
    # non-numeric and not listed in `cols_to_encode`. Errors raised by the
    # download or by CSV parsing propagate unchanged.

    # Only an explicit http:// or https:// scheme counts as remote, so a local
    # file whose name merely starts with "http" is still read from disk.
    is_url = occursin(r"^https?://"i, path_or_url)

    # Remote data is downloaded straight into memory, so no temporary file is
    # left behind; `CSV.File` accepts either a path or a byte vector.
    source = if is_url
        take!(Downloads.download(path_or_url, IOBuffer()))
    else
        path_or_url
    end

    # Preprocessing: any row with a missing value in any column is removed.
    df = dropmissing(DataFrame(CSV.File(source)))

    # `target_col` may be a positional index or a column name.
    y = target_col isa Integer ? df[:, target_col] : df[:, Symbol(target_col)]
    Xdf = select(df, DataFrames.Not(target_col))

    # Integer entries of `cols_to_encode` index the feature columns, i.e. the
    # columns that remain after the target has been removed.
    feature_names = names(Xdf)
    encode_cols = Symbol[
        c isa Integer ? Symbol(feature_names[c]) : Symbol(c) for c in cols_to_encode
    ]
    X = one_hot_encode(Xdf; cols_to_encode=encode_cols, drop_first=true)

    return Dataset(name, X, collect(Float64, y))
end
6 changes: 5 additions & 1 deletion test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,8 @@
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[compat]
CSV = "0.10"
DataFrames = "1"
16 changes: 15 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
using RidgeRegression
using Test
using DataFrames
using LinearAlgebra
using CSV

@testset "RidgeRegression.jl" begin
    # Each entry pairs a test-set label with the file that holds its tests.
    suites = (
        "Dataset Tests" => "src/units/units_dataset_tests.jl",
        "One-Hot Encoding Tests" => "src/units/units_encoding_tests.jl",
        "Load CSV Dataset Tests" => "src/units/units_load_csv_dataset_tests.jl",
    )

    # Run the files in the listed order, each inside its own nested test set.
    for (label, file) in suites
        @testset "$label" begin
            include(file)
        end
    end
end
19 changes: 19 additions & 0 deletions test/src/units/units_dataset_tests.jl
Comment thread
etontackett marked this conversation as resolved.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Individual test files should be wrapped as their own modules.

Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
@testset "Dataset constructor stores fields correctly" begin
    features = [1 2; 3 4]
    response = [10, 20]
    unit = Dataset("toy", features, response)

    # The stored values are exactly the ones handed to the constructor.
    @test unit.name == "toy"
    @test unit.X == features
    @test unit.y == response

    # Shapes agree: two observations, two feature columns.
    @test size(unit.X) == (2, 2)
    @test length(unit.y) == 2

    # Integer entries compare equal to their floating-point counterparts.
    @test unit.X[1, 1] == 1.0
    @test unit.y[2] == 20.0
end

@testset "Dataset constructor throws error for mismatched dimensions" begin
    features = [1 2; 3 4]   # two rows
    too_long = [1, 2, 3]    # three responses, one more than the rows of `features`

    @test_throws ArgumentError Dataset("bad", features, too_long)
end
38 changes: 38 additions & 0 deletions test/src/units/units_encoding_tests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
@testset "one_hot_encode encodes specified categorical columns and keeps numeric columns" begin
    df = DataFrame(
        A = ["red", "blue", "red", "green"],
        B = [1, 2, 3, 4],
        C = ["small", "large", "medium", "small"],
    )

    X = one_hot_encode(df; cols_to_encode=[:A, :C], drop_first=true)

    # Expected layout (first level of each categorical column is dropped):
    #   1: intercept | 2: A=blue | 3: A=green | 4: B | 5: C=large | 6: C=medium
    @test (4, 6) == size(X)
    @test ones(4) == X[:, 1]
    @test [1.0, 2.0, 3.0, 4.0] == X[:, 4]

    # Indicator columns hold only 0/1 and at most one level is active per row.
    @test all(x -> x == 0.0 || x == 1.0, X[:, [2, 3, 5, 6]])
    @test all(vec(sum(X[:, 2:3]; dims=2)) .<= 1)
    @test all(vec(sum(X[:, 5:6]; dims=2)) .<= 1)

    # Exact indicator values pin down both the level order and the dropped level.
    @test [0.0, 1.0, 0.0, 0.0] == X[:, 2]
    @test [0.0, 0.0, 0.0, 1.0] == X[:, 3]
    @test [0.0, 1.0, 0.0, 0.0] == X[:, 5]
    @test [0.0, 0.0, 1.0, 0.0] == X[:, 6]
end

@testset "one_hot_encode throws error for invalid column specifications" begin
    colors = ["red", "blue", "red", "green"]
    sizes = ["small", "large", "medium", "small"]
    df = DataFrame(; A=colors, B=[1, 2, 3, 4], C=sizes)

    # `C` holds strings but is not listed in `cols_to_encode`, so it is rejected.
    @test_throws ArgumentError one_hot_encode(df; cols_to_encode=[:A], drop_first=true)
end

@testset "one_hot_encode supports integer-coded categorical columns when specified" begin
    df = DataFrame(
        group = [1, 2, 1, 3],
        x = [10.0, 20.0, 30.0, 40.0],
    )

    X = one_hot_encode(df; cols_to_encode=[:group], drop_first=true)

    # Expected layout: 1: intercept | 2: group=2 | 3: group=3 | 4: x
    # (level "1" appears first and is dropped).
    @test (4, 4) == size(X)
    @test ones(4) == X[:, 1]
    @test [10.0, 20.0, 30.0, 40.0] == X[:, 4]
    @test all(x -> x == 0.0 || x == 1.0, X[:, 2:3])
    @test [0.0, 1.0, 0.0, 0.0] == X[:, 2]
    @test [0.0, 0.0, 0.0, 1.0] == X[:, 3]
end
38 changes: 38 additions & 0 deletions test/src/units/units_load_csv_dataset_tests.jl

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where do you test for missing values?

Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
@testset "load_csv_dataset drops missing rows and uses target column" begin
    df = DataFrame(
        a = [1.0, 2.0, missing, 4.0],
        b = ["x", "y", "y", "x"],
        y = [10.0, 20.0, 30.0, 40.0],
    )

    # Write the fixture inside a scratch directory that is removed afterwards.
    # `tempname() * ".csv"` would create a path that is never cleaned up.
    mktempdir() do dir
        tmp = joinpath(dir, "data.csv")
        CSV.write(tmp, df)

        d = load_csv_dataset(tmp; target_col=:y, cols_to_encode=[:b], name="tmp")

        @test "tmp" == d.name

        # Row 3 has a missing `a`, so only rows 1, 2 and 4 survive.
        @test 3 == length(d.y)
        @test 3 == size(d.X, 1)
        @test [10.0, 20.0, 40.0] == d.y

        # Columns: intercept | a | b=y (level "x" comes first and is dropped).
        @test (3, 3) == size(d.X)
        @test ones(3) == d.X[:, 1]
        @test [1.0, 2.0, 4.0] == d.X[:, 2]
        @test [0.0, 1.0, 0.0] == d.X[:, 3]
    end
end

@testset "load_csv_dataset drops missing rows and uses target column by index" begin
    df = DataFrame(
        a = [1.0, 2.0, missing, 4.0],
        b = ["x", "y", "y", "x"],
        y = [10.0, 20.0, 30.0, 40.0],
    )

    # Write the fixture inside a scratch directory that is removed afterwards.
    # `tempname() * ".csv"` would create a path that is never cleaned up.
    mktempdir() do dir
        tmp = joinpath(dir, "data.csv")
        CSV.write(tmp, df)

        # Column 3 of the file is `y`; selecting it by position must behave the
        # same as selecting it by name.
        d = load_csv_dataset(tmp; target_col=3, cols_to_encode=[:b], name="tmp2")

        @test "tmp2" == d.name
        @test [10.0, 20.0, 40.0] == d.y
        @test 3 == size(d.X, 1)

        # Columns: intercept | a | b=y (level "x" comes first and is dropped).
        @test (3, 3) == size(d.X)
        @test ones(3) == d.X[:, 1]
        @test [1.0, 2.0, 4.0] == d.X[:, 2]
        @test [0.0, 1.0, 0.0] == d.X[:, 3]
    end
end
Loading