diff --git a/.Rbuildignore b/.Rbuildignore index f43a20cd..61e99917 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,5 @@ NEWS.rmd README.rmd ^data-raw$ cran-comments.md +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index c3723c95..5893fc52 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ Meta /doc/ /Meta/ LICENSE.md +.positai diff --git a/DESCRIPTION b/DESCRIPTION index 753539b4..ff82626d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: BioMonTools Type: Package Title: Biomonitoring and Bioassessment Calculations -Version: 1.2.4.9012 +Version: 1.2.4.9013 Authors@R: c( person("Erik W.", "Leppo", email="Erik.Leppo@tetratech.com", diff --git a/NAMESPACE b/NAMESPACE index e58d646f..0072ba9f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,9 +15,11 @@ export(metvalgrpxl) export(qc.checks) export(qc_taxa) export(qc_taxa_match_official) -export(qc_taxa_values_ffg) -export(qc_taxa_values_habit) -export(qc_taxa_values_tolval) +export(qc_taxa_names_proof) +export(qc_taxa_phylo) +export(qc_taxa_values_char) +export(qc_taxa_values_logical) +export(qc_taxa_values_numeric) export(rarify) export(taxa_translate) importFrom(rlang,.data) diff --git a/NEWS b/NEWS index 63b528f7..406fdaa3 100644 --- a/NEWS +++ b/NEWS @@ -4,10 +4,22 @@ NEWS - #> Last Update: 2026-03-18 22:51:30.206983 + #> Last Update: 2026-05-11 15:57:28.384427 # Version History +## Changes in version 1.2.4.9013 (2026-05-11) + +- refactor: Rename qc_taxa_match_official to qc_taxa_names_match +- feature: Add qc_taxa_names_proof function + - Check for spelling issues +- feature: Add qc_taxa_phylo for checking master taxa list names +- refactor: Change names from distinct to parent in + qc_taxa_phylo\$unique_parent output +- data: Create example final id issues dataset +- docs: Document finalid_issues data +- fix: Fix metric x_Obs_CatoRhin in metric.values, Issue \#138 + ## Changes in version 1.2.4.9012 (2026-03-18) - feature: Add qc_taxa_values_tolval function diff --git a/NEWS.md b/NEWS.md index 63b528f7..406fdaa3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,10 +4,22 @@ NEWS - #> Last Update: 2026-03-18 22:51:30.206983 + #> Last Update: 2026-05-11 15:57:28.384427 # Version History +## Changes in version 1.2.4.9013 (2026-05-11) + +- refactor: Rename qc_taxa_match_official to qc_taxa_names_match +- feature: Add qc_taxa_names_proof function + - Check for spelling issues +- feature: Add qc_taxa_phylo for checking master taxa list names +- refactor: Change names from distinct to parent in + qc_taxa_phylo\$unique_parent output +- data: Create example final id issues dataset +- docs: Document finalid_issues data +- fix: Fix metric x_Obs_CatoRhin in metric.values, Issue \#138 + ## Changes in version 1.2.4.9012 (2026-03-18) - feature: Add qc_taxa_values_tolval function diff --git a/NEWS.rmd b/NEWS.rmd index 902fa8d8..d6277861 100644 --- a/NEWS.rmd +++ b/NEWS.rmd @@ -18,6 +18,17 @@ cat(paste0("Last Update: ",Sys.time())) # Version History +## Changes in version 1.2.4.9013 (2026-05-11) + +* refactor: Rename qc_taxa_match_official to qc_taxa_names_match +* feature: Add qc_taxa_names_proof function + + Check for spelling issues +* feature: Add qc_taxa_phylo for checking master taxa list names +* refactor: Change names from distinct to parent in qc_taxa_phylo$unique_parent output +* data: Create example final id issues dataset +* docs: Document finalid_issues data +* fix: Fix metric x_Obs_CatoRhin in metric.values, Issue #138 + ## Changes in version 1.2.4.9012 (2026-03-18) * feature: Add qc_taxa_values_tolval function diff --git a/R/data.R b/R/data.R index a1afea74..4aa416ca 100644 --- a/R/data.R +++ b/R/data.R @@ -750,3 +750,37 @@ #' } #' @source example data "data_mmi_dev_small" +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# "data_taxa_names_issues" ---- +#' \name{data_taxa_names_issues} +#' \alias{data_taxa_names_issues} +#' \docType{data} +#' \title{ +#' A Capitalized Title for the Data Set +#' } +# \description{ +#' %% ~~ A concise (1-5 lines) description of the dataset. ~~ +#' } +#' \usage{data("data_taxa_names_issues")} +#' \format{ +#' A data frame with 215 observations on the following 3 variables. +#' \describe{ +#' \item{\code{Source}}{a character vector} +#' \item{\code{Issue}}{a character vector} +#' \item{\code{FinalID}}{a character vector} +#' } +#' } +#' \details{ +#' %% ~~ If necessary, more details than the __description__ above ~~ +#' } +#' \source{ +#' %% ~~ reference to a publication or URL from which the data were obtained ~~ +#' } +#' \references{ +#' %% ~~ possibly secondary sources and usages ~~ +#' } +#' \examples{ +#' data(data_taxa_names_issues) +#' ## maybe str(data_taxa_names_issues) ; plot(data_taxa_names_issues) ... +#' } +#' \keyword{datasets} diff --git a/R/metric_values.R b/R/metric_values.R index ea866b78..1830cbdb 100644 --- a/R/metric_values.R +++ b/R/metric_values.R @@ -1244,6 +1244,7 @@ metric.values.bugs <- function(myDF message(msg) }## IF ~ verbose + ## Logical ---- # Logical Columns to Logical # Ensure in correct format, Access converts sometimes to 0, -1 # 2025-06-13 @@ -1260,6 +1261,7 @@ metric.values.bugs <- function(myDF myDF[, i] <- as.logical(myDF[, i]) }## FOR ~ i ~ logical + ## NonTarget ---- # Remove NonTarget Taxa (added back 20200715, missing since 20200224) # Function fails if all NA (e.g., column was missing) (20200724) if (verbose == TRUE) { @@ -1283,6 +1285,7 @@ metric.values.bugs <- function(myDF myDF <- dplyr::filter(myDF, NONTARGET != TRUE | is.na(NONTARGET)) + ## ColNames to Upper ---- # # Convert columns to upper case (Phylo, FFG, Habit, Life_Cycle) if (verbose == TRUE) { debug_topic <- "Munging, text cols, toupper" @@ -1334,7 +1337,7 @@ metric.values.bugs <- function(myDF }## IF ~ verbose - + ## White Space ---- # Remove white space myDF[, "HABIT"] <- gsub(" ","", myDF[, "HABIT"]) myDF[, "FFG"] <- gsub(" ","", myDF[, "FFG"]) @@ -1345,6 +1348,7 @@ metric.values.bugs <- function(myDF myDF[, "ELEVATION_ATTR"] <- gsub(" ","", myDF[, "ELEVATION_ATTR"]) myDF[, "GRADIENT_ATTR"] <- gsub(" ","", myDF[, "GRADIENT_ATTR"]) myDF[, "WSAREA_ATTR"] <- gsub(" ","", myDF[, "WSAREA_ATTR"]) + ## Helper Cols ---- # code new columns ## match, any myDF[, "HABIT_BU"] <- grepl("BU", myDF[, "HABIT"]) @@ -1366,6 +1370,7 @@ metric.values.bugs <- function(myDF myDF[, "LC_MULTI"] <- grepl("MULTI", myDF[, "LIFE_CYCLE"]) myDF[, "LC_SEMI"] <- grepl("SEMI", myDF[, "LIFE_CYCLE"]) myDF[, "LC_UNI"] <- grepl("UNI", myDF[, "LIFE_CYCLE"]) + myDF[, "FFG2_DD"] <- grepl("DD", myDF[, "FFG2"]) myDF[, "FFG2_PRE"] <- grepl("PR", myDF[, "FFG2"]) myDF[, "TI_STENOCOLD"] <- grepl("STENOC", myDF[, "THERMAL_INDICATOR"]) myDF[, "TI_COLD"] <- grepl("COLD", myDF[, "THERMAL_INDICATOR"]) @@ -1447,7 +1452,7 @@ metric.values.bugs <- function(myDF # filter(row_number()<=5) - # Create Dominant N #### + ## Dominant N ---- # Create df for Top N (without ties) if (verbose == TRUE) { debug_topic <- "Munging, Dom" @@ -2595,12 +2600,19 @@ metric.values.bugs <- function(myDF ### FFG2 #### # marine ## nt_ffg2 + , nt_ffg2_deepdep = dplyr::n_distinct(TAXAID[EXCLUDE != TRUE + & FFG2_DD == TRUE] + , na.rm = TRUE) , nt_ffg2_intface = NA , nt_ffg2_subsurf = NA ## pi_ffg2 + , pi_ffg2_deepdep = 100 * sum(N_TAXA[FFG2_DD == TRUE] + , na.rm = TRUE) / ni_total , pi_ffg2_scavburr = NA ## pt_ffg2 - # = conveyorbelt, interface, scavengerbrowser, subsurface, watercolumn, predator + , pi_ffg2_deepdep = 100 * nt_ffg2_deepdep / nt_total + # = conveyorbelt, interface, scavengerbrowser, subsurface, + # watercolumn, predator ### Habit #### #(need to be wild card. that is, counts both CN,CB and CB as climber) @@ -2762,6 +2774,7 @@ metric.values.bugs <- function(myDF ### Density #### # Numbers per area sampled + , ni_m2 = NA ### Estuary-Marine #### # Mixed in with other metrics @@ -5156,12 +5169,14 @@ metric.values.fish <- function(myDF # OR are really different and probably only applicable # to a specific entity #### Boise BCG - , x_Obs_CatoRhin <- as.integer(min(1, dplyr::n_distinct(TAXAID[EXCLUDE != TRUE + , x_Obs_CatoRhin = as.integer(min(1, dplyr::n_distinct(TAXAID[EXCLUDE != TRUE & GENUS == "RHINICHTHYS"] - , na.rm = TRUE)) + + , na.rm = TRUE), + na.rm = TRUE) + min(1, dplyr::n_distinct(TAXAID[EXCLUDE != TRUE & FAMILY == "CATOSTOMIDAE"] - , na.rm = TRUE))) + , na.rm = TRUE), + na.rm = TRUE)) #### New Mexico Fish BCG , nt_piscivore_BCG_att66s6t = dplyr::n_distinct(TAXAID[EXCLUDE != TRUE & TROPHIC_PI == TRUE diff --git a/R/qc_taxa_match_official.R b/R/qc_taxa_names_match.R similarity index 100% rename from R/qc_taxa_match_official.R rename to R/qc_taxa_names_match.R diff --git a/R/qc_taxa_names_proof.R b/R/qc_taxa_names_proof.R new file mode 100644 index 00000000..b4e92c37 --- /dev/null +++ b/R/qc_taxa_names_proof.R @@ -0,0 +1,297 @@ +#' @title QC Taxa List Proofreading +#' +#' @description Performs basic proofreading of names in a taxa list. +#' +#' @details Returns possible differences in a data frame with three columns (qc check, +#' name, potential match(es)). Not all hits are errors but are potential issues +#' that may need to be addressed. +#' +#' The distance check Computes pairwise string distances between names and +#' returns name pairs that are likely duplicates. +#' +#' Uses Jaro-Winkler (jw) distance by default which performs well for names. +#' Other options are Levenshtein (lv), good for typos, and osa, like Levenshtein +#' but slightly faster. +#' +#' Good thresholds are jw 0.1 to 0.2, lv and osa <= 2 +#' +#' The checks include: +#' +#' * **spaces**, leading or trailing, including html white space, or doulble +#' space, or more than 3 +#' +#' * **case**, differences +#' +#' * **sp** variants; (with/without .) sp and spp, inside next to slash +#' +#' * **stage** variants; adult, A, pupa, pupae, P, immature, I, imm, juv, +#' juvenile, larva, larvae, L, zoea, myses, mysops? +#' +#' * **probably**, variants; "?", " prob ", " prob. ", " probably " +#' * add parentheses +#' +#' * **cf**, variants start, or in string, cf, c.f., cf., c.f +#' +#' * backslash_dash_underscore +#' +#' complex cmplx +#' +#' all caps +#' +#' and, & +#' +#' star +#' +#' head +#' +#' possibly, poss, poss. +#' +#' +#' unknown unk undetermined undet(.), indet, indetermined +#' +#' large small with space or parentheses +#' +#' backslash_dash +#' +#' * **slash, direction** direction; including dash +#' +#' * **slash, taxa** x/y vs. y/x +#' +#' * **grp** variants; grp, gr, group, (with/without .) and without +#' and dash and genus group, gp, dash or space before +#' +#' * **unid** variants; unid, unidentified, unid diff, uid, (with/without .) +#' +#' diff without unid +#' +#' * **prob** variants; prob, prob., probably, including "?" (anywhere in text) +#' +#' * **sensu** +#' +#' * **parenthetical** text; sensu, prob, inc spec, (with/without .) +#' +#' * **near** variants; nr n +#' +#' aff. , f +#' flag +#' +#' quotes +#' +#' slash order; c/o vs o/c +#' +#' with, without, w/, w/o, w/ o, w /, w / o +#' +#' frag and fragment +#' +#' Tubificid +#' +#' * **colon** e.g., Family: Genus +#' +#' * **patterns** tera$ in Order, idae$ in Family, inae$ Subfamily, and +#' ini$ in Tribe. Look for those patterns not in the expected columns. +#' would need the entire taxa table. Right now only looking at a single vector. +#' +#' immature, imm, w/ and w/o hair chaetae, hair+pectinate, bifid +#' setae, chaetae +#' +#' Common authors not in parentheses, e.g., Epler +#' +#' +#' text mining algorithms (word similarity) +#' Other checks caught: +#' +#' some not included: +#' +#' * f. = forma = valid +#' +#' @param names A character vector containing taxa name data. +#' @param method String distance method (passed to stringdist). +#' Default = "jw" +#' @param max_distance Numeric threshold for similarity. Default = 0.13 +#' +#' @return A data frame with col_tolval values, occurrence (n), and if valid +#' (TRUE/FALSE). +#' +#' @examples +#' # Example Issues +#' proof_issues <- qc_taxa_names_proof(data_taxa_names_issues$FinalID) +#' proof_issues$issues +#' lapply(proof_issues, nrow) +#' +#' +#' # Example Master Taxa Lists +#' proof_MBSS <- qc_taxa_names_proof(data_benthos_MBSS$TAXAID, "jw", 0.13) +#' proof_MBSS$issues +#' head(proof_MBSS$distance) +#' +#' proof_PacNW_taxaid <- qc_taxa_names_proof(data_benthos_PacNW$TaxaID) +#' proof_PacNW_taxaid$issues +#' head(proof_PacNW_taxaid$distance) +#' +#' proof_PacNW_master_taxaid <- qc_taxa_names_proof(TaxaMaster_Ben_BCG_PacNW$TaxaID) +#' proof_PacNW_master_taxaid$issues +#' proof_PacNW_master_taxaid$stage +#' head(proof_PacNW_taxaid$distance) +#' +#' @export +qc_taxa_names_proof <- function(names, + method = "jw", + max_distance = 0.13) { + + boo_debug <- FALSE + if (boo_debug) { + names = BioMonTools::data_benthos_MBSS$TAXAID + # names = BioMonTools::data_benthos_PacNW$TaxaID + # names = BioMonTools::TaxaMaster_Ben_BCG_PacNW$TaxaID + method = "jw" + max_distance = 0.13 + }## boo_debug + + # Suppress empty tibbles + # @title Drop empty data frames from output + # @description + # Returns NULL instead of printing a zero-row tibble. + # @param x A data frame or tibble. + # @return The data frame, or NULL if it has zero rows. + drop_if_empty <- function(x) { + if (nrow(x) == 0) NULL else x + }## FUNCTION ~ drop_if_empty + + + # unique---- + # avoid duplicates in output + names <- unique(names) + + # 01. distance ---- + df_distance <- data.frame(name = names) |> + # generate all pairwise combinations + # join of id < id enforces unique pairs + dplyr::mutate(id = dplyr::row_number()) |> + dplyr::inner_join( + y = data.frame(name = names) |> + dplyr::mutate(id = dplyr::row_number()), + by = dplyr::join_by(id < id), + suffix = c("_x", "_y")) |> + # calc distance + dplyr::mutate( + distance = stringdist::stringdist( + name_x, + name_y, + method = method)) |> + # filter + dplyr::filter(distance <= max_distance) |> + # sort + dplyr::arrange(distance) |> + # filter if blank + drop_if_empty() + + # 02. space ---- + # beginning or ending white space + # including non-breaking ws (html $nbsp), unicode \u00A0 + pat_space <- "^(?:\\s|\\u00A0)+|(?:\\s|\\u00A0)+$" + + df_space <- names[grepl(pat_space, + names, + ignore.case = TRUE, + perl = FALSE)] + + + # 03. case ---- + df_case <- data.frame(name_orig = names) |> + dplyr::mutate(name_lower = tolower(name_orig)) |> + dplyr::group_by(name_lower) |> + dplyr::filter(dplyr::n() > 1) |> + dplyr::summarise(count = dplyr::n(), + original_names = paste(unique(name_orig), collapse = ", "), + .groups = "drop") |> + # filter if blank + drop_if_empty() + + # 04. sp ---- + # " sp ", " sp. ", " spp ", and " spp. " + # pat_sp <- "(?<=\\s)sp{1,2}\\.?(?=\\s)" + # # sp and spp + # # dot and not dot + # + # df_sp <- names[grepl(pat_sp, + # names, + # ignore.case = TRUE)] + + + # 05. stage ---- + # regex pattern at end after a space + # negative look ahead for " sp. A" and " sp A" + pat_stage <- "(? 0 to "issues" + results$issues <- names(results)[vapply(results, length, integer(1)) > 0] + + # Result---- + return(results) + +}## FUNCTION ~ END + diff --git a/R/qc_taxa_phylo.R b/R/qc_taxa_phylo.R new file mode 100644 index 00000000..2b6eefbe --- /dev/null +++ b/R/qc_taxa_phylo.R @@ -0,0 +1,236 @@ +#' @title QC Taxa Phylo +#' +#' @description Performs basic quality control on a phylogenetic list. +#' +#' @details Returns a list of with elements corresponding the various checks on +#' a phylogenetic (standard or master) taxa list. +#' +#' The phylogenetic list is multiple columns that the user will provide in rank +#' order from coarse to fine along with a FinalID column. +#' +#' The checks are listed below and only report the entries that fail. +#' +#' Some checks will detect those with potential issues but others that are +#' valid. +#' +#' * **unique_parent** Each taxonomic rank (child) has a unique parent (coarser rank). +#' Parents include all coarser ranks (as defined by user). +#' +#' * **phylo_unique_rank** Each name is in only one phylogenetic rank column +#' +#' * **phylo_as_finalid** Each phylogenetic name is also in FinalID +#' +#' * **finalid_as_phylo** Each final id is a phylogenetic name +#' +#' others checks? +#' +#' case (all lower, all upper) +#' +#' spaces +#' ? +#' non A-Z (any case), e.g., slash, dash, underscore, parentheses, etc. +#' +#' NEED TRIGGER FOR ignore.case, default to FALSE +#' +#' If ignore_case is TRUE then all columns (finalid and phylo_names) will be +#' converted to upper case before checks are performed. +#' +#' +#' @param data A data frame +#' @param finalid Column name for FinalID. Default = "FinalID" +#' @param phylo_names Vector of phylogenetic names in order from coarse to fine. +#' Default = c("Phylum", "Subphylum", "Class", "Subclass", "Order", "Suborder", +#' "Family", "Subfamily", "Tribe", "Genus") +#' @param ignore_case Should case be ignore for checks. Default = FALSE. +#' +#' @return A list elements for each qc check. +#' +#' @examples +#' qc_phylo_PacNW <- qc_taxa_phylo(TaxaMaster_Ben_BCG_PacNW, +#' "TaxaID", +#' phylo_names = c("Phylum", +#' "SubPhylum", +#' "Class", +#' "SubClass", +#' "Order", +#' "SuperFamily", +#' "Family", +#' "Tribe", +#' "Genus", +#' "SubGenus", +#' "Species")) +#' qc_phylo_PacNW$issues +#' qc_phylo_PacNW$unique_parent +#' qc_phylo_PacNW$phylo_unique_rank +#' qc_phylo_PacNW$phylo_as_finalid +#' qc_phylo_PacNW$finalid_as_phylo +#' +#' @export +qc_taxa_phylo <- function(data, + finalid = "FinalID", + phylo_names = c("Phylum", + "Subphylum", + "Class", + "Subclass", + "Order", + "Suborder", + "Family", + "Subfamily", + "Tribe", + "Genus"), + ignore_case = FALSE) { + + boo_debug <- FALSE + if (boo_debug) { + data = BioMonTools::TaxaMaster_Ben_BCG_PacNW + finalid = "TaxaID" + phylo_names = c("Phylum", + "SubPhylum", + "Class", + "SubClass", + "Order", + "SuperFamily", + "Family", + "Tribe", + "Genus", + "SubGenus", + "Species") + }## boo_debug + + # Suppress empty tibbles + # @title Drop empty data frames from output + # @description + # Returns NULL instead of printing a zero-row tibble. + # @param x A data frame or tibble. + # @return The data frame, or NULL if it has zero rows. + drop_if_empty <- function(x) { + if (nrow(x) == 0) NULL else x + }## FUNCTION ~ drop_if_empty + + # QC ---- + # check for any phylo_names not in data + stopifnot(all(phylo_names %in% names(data))) + + # 00. Data Prep---- + cols_phylo <- phylo_names + + # 01. unique_parent ---- + # run as a list since number of "phylo_name"s unknown + + # need extra loop if do all combinatins + + # get all combinations of parent and child + len_cols_phylo <- length(cols_phylo) + df_pairs <- data.frame( + col_num_child = rep(2:len_cols_phylo, times = 1:(len_cols_phylo-1)), + col_num_parent = unlist(lapply(2:len_cols_phylo, + function(i) (i - 1):1))) + df_pairs$phylo_child <- cols_phylo[df_pairs$col_num_child] + df_pairs$phylo_parent <- cols_phylo[df_pairs$col_num_parent] + df_pairs$check_name <- paste(df_pairs$phylo_child, + df_pairs$phylo_parent, + sep = "_") + + # create results list + i_list <- vector("list", nrow(df_pairs)) + names(i_list) <- df_pairs$check_name + + for (i in seq_len(nrow(df_pairs))) { + # message(paste0(i, "; ", df_pairs[i, "check_name"])) + + i_list[[i]] <- data |> + # rename child column + dplyr::rename(child_name = dplyr::all_of(df_pairs[i, "phylo_child"])) |> + # add ranks + dplyr::mutate(child_rank = df_pairs[i, "phylo_child"]) |> + dplyr::mutate(parent_rank = df_pairs[i, "phylo_parent"]) |> + # calc + dplyr::group_by(child_name, child_rank, parent_rank) |> + dplyr::filter(!is.na(child_name)) |> + dplyr::summarize(parent_n = dplyr::n_distinct(.data[[df_pairs[i, "phylo_parent"]]]), + parent_names = paste(sort(unique(.data[[df_pairs[i, "phylo_parent"]]])), + collapse = ", "), + .groups = "drop") + }## FOR ~ i + + df_unique_parent <- i_list |> + # combine element of list + # source is target_rank + dplyr::bind_rows() |> + # filter + dplyr::filter(parent_n > 1) |> + # filter if blank + drop_if_empty() + + + # 03. phylo_as_finalid ---- + df_phylo_as_finalid <- data |> + # cols to keep + dplyr::select(dplyr::all_of(finalid), dplyr::all_of(cols_phylo)) |> + # pivot + tidyr::pivot_longer(cols = dplyr::all_of(cols_phylo), + names_to = "phylo_level", + values_to = "phylo_name") |> + # non blank + dplyr::filter(!is.na(phylo_name)) |> + # add count by phylo_level + dplyr::add_count(phylo_name, name = "n_phylo_name") |> + # unique + dplyr::distinct(.data[[finalid]], + phylo_name, + phylo_level, + n_phylo_name) |> + # match final id + dplyr::mutate(match_finalid = phylo_name %in% data[, finalid]) |> + # cols 2 keep + dplyr::select(phylo_name, phylo_level, n_phylo_name, match_finalid) |> + # unique + dplyr::distinct(phylo_name, phylo_level, n_phylo_name, match_finalid) |> + # find issues, false and not blank + dplyr::filter(match_finalid == FALSE, + phylo_name != "") |> + # add factor for sorting + dplyr::mutate(phylo_level = factor(phylo_level, + levels = cols_phylo)) |> + # sort + dplyr::arrange(phylo_level, phylo_name) + + # 02. phylo_unique_rank ---- + # duplicate phylo names + ## appearing in more than one phylo column + # df_phylo_unique_rank <- df_match_phylo_final |> + df_phylo_unique_rank <- df_phylo_as_finalid |> + # add count of duplicate names + dplyr::add_count(phylo_name, name = "num_phylo_col") |> + # filter for dups + dplyr::filter(num_phylo_col > 1) |> + # sort + dplyr::arrange(phylo_name, phylo_level) |> + # filter if blank + drop_if_empty() + + # 04. finalid_as_phylo ---- + df_finalid_as_phylo <- data |> + dplyr::mutate(match_phylo = rowSums(dplyr::across(all_of(cols_phylo), + ~ .x == .data[[finalid]]), + na.rm = TRUE) > 0) |> + dplyr::select(dplyr::all_of(finalid), match_phylo) |> + dplyr::filter(match_phylo == FALSE) + + + + # Combine ---- + results <- list( + "issues" = NULL, + "unique_parent" = df_unique_parent, + "phylo_unique_rank" = df_phylo_unique_rank, + "phylo_as_finalid" = df_phylo_as_finalid, + "finalid_as_phylo" = df_finalid_as_phylo) + # report names of elements with length > 0 to "issues" + results$issues <- names(results)[vapply(results, length, integer(1)) > 0] + + # Result---- + return(results) + +}## FUNCTION ~ END + diff --git a/R/qc_taxa_values_character.R b/R/qc_taxa_values_character.R new file mode 100644 index 00000000..d34b7544 --- /dev/null +++ b/R/qc_taxa_values_character.R @@ -0,0 +1,130 @@ +#' QC Autecological Character Values +#' +#' Performs basic QC of a character column against a list of accepted values. +#' +#' Returns a data frame the values from the input with counts (column = n) from +#' the column and whether the values appeared in valid values (column = +#' valid). Values in the accepted values not appearing in the input are appended +#' to the bottom of the returned data frame. These values are marked as n = NA +#' and valid = TRUE. If NA is a valid value it must be included in valid_vals +#' or in the output NA will be labeled as valid = FALSE. +#' +#' The default accepted values are the abbreviations are those used as +#' metric.values(). +#' +#' For Function Feeding Group (FFG); CF, CG, MH, OM, PA, PI, PR, SC, SH, and XY. User using FC +#' and GC over CF and CG can modify the accepted values. Both versions are +#' accepted in metric.values(). +#' +#' For Habit; BU, CB, CN, SK, SP, and SW. Valid separated with "," are +#' first split apart and spaces removed. +#' +#' life cycle +#' +#' bcg_attr +#' +#' habitat, habitat structure, elevation, gradient, thermal +#' +#' @param data A data frame containing autecological taxa data. +#' @param col_char The column containing the character values to be checked. +#' @param valid_vals Accepted values. +#' @param separator If values should be separated and checked include a +#' delimiter. Default = NULL +#' +#' @return A data frame with col_char values, occurrence (n), and if valid (TRUE/ +#' FALSE). Any missing valid_vals are appended. +#' +#' @examples +#' Values, FFG, Abr +#' qc_taxa_values_char(data_benthos_PacNW, +#' "FFG", +#' valid_vals = c("CF", +#' "CG", +#' "MH", +#' "OM", +#' "PA", +#' "PH", +#' "PI", +#' "PR", +#' "SC", +#' "SH", +#' "XY", +#' NA)) +#' +#' # Values, FFG, full names +#' qc_taxa_values_char(data_benthos_MBSS, +#' "FFG", +#' valid_vals = c("Collector", +#' "Filterer", +#' "Predator", +#' "Scraper", +#' "Shredder")) +#' +#' # Values, Habit, no separator +#' qc_taxa_values_char(data_benthos_MBSS, +#' "Habit", +#' valid_vals = c("bu", "cb", "cn", "dv", "sk", "sp", "sw")) +#' +#' # Values, Habit, no separator +#' qc_taxa_values_char(data_benthos_MBSS, +#' "Habit", +#' valid_vals = c("bu", "cb", "cn", "dv", "sk", "sp", "sw"), +#' separator = ",") +#' +#' +#' +#' @export +qc_taxa_values_char <- function(data, + col_char = NULL, + valid_vals = NULL, + separator = NULL) { + + # QC---- + # Check for values + # stopifnot(!is.null(data)) + stopifnot(!is.null(col_char)) + stopifnot(!is.null(valid_vals)) + # Check for col_char in data + if (!rlang::as_string(col_char) %in% names(data)) { + stop("Column '", + rlang::as_string(col_char), + "' is missing from input data.", call. = FALSE) + }# IF ~ col_char + + # Convert valid_vals to data frame + df_valid_vals <- as.data.frame(valid_vals) + names(df_valid_vals) <- col_char + + # occurrence---- + ## separator ---- + # parse column first (remove whitespace) + # track original columns? + if (is.null(separator)) { + + }## IF ~ separator + + # as is, no separator + df_result <- data |> + # occurrence + dplyr::count(.data[[col_char]], name = "n") |> + # force valid value rows to exist + # *ERROR*20260511*comment out + # tidyr::complete( + # .data[[col_char]] = c(TRUE, FALSE, NA), + # fill = list(n = 0)) |> + # valid + dplyr::mutate(valid = .data[[col_char]] %in% valid_vals) #|> + # ## values + # dplyr::full_join(y = df_valid_vals, + # by = dplyr::join_by({{col_char}})) |> + # ## convert NA to TRUE + # dplyr::mutate(valid = dplyr::case_when(is.na(valid) ~ TRUE, + # .default = valid)) + + # look at logical, rename then rename back + + # Result---- + return(df_result) + +}## FUNCTION ~ END + diff --git a/R/qc_taxa_values_ffg.R b/R/qc_taxa_values_ffg.R deleted file mode 100644 index cc86a9ef..00000000 --- a/R/qc_taxa_values_ffg.R +++ /dev/null @@ -1,81 +0,0 @@ -#' QC Functional Feeding Group (FFG) Values -#' -#' Performs basic QC of the FFG column against a list of accepted values. -#' -#' Returns a data frame the values from the input with counts (column = n) from -#' the FFG column and whether the value appeared in valid values (column = -#' valid). Values in the accepted values not appearing in the input are appended -#' to the bottom of the returned data frame. These values are marked as n = NA -#' and valid = TRUE. -#' -#' The default accepted values are the abbreviations are those used as -#' metric.values(); CF, CG, MH, OM, PA, PI, PR, SC, SH, and XY. User using FC -#' and GC over CF and CG can modify the accepted values. Both versions are -#' accepted in metric.values(). -#' -#' @param df_data A data frame containing taxa data. -#' @param col_ffg The column containing FFG values. Default = "FFG" -#' @param valid_vals Accepted values. -#' Default = c(CF, CG, MH, OM ,PA, PH, PI, PR, SC, SH, XY) -#' -#' @return A data frame with col_ffg values, occurrence (n), and if valid (TRUE/ -#' FALSE). Additional values from valid_vals are appended. -#' -#' @examples -#' # Values, Default -#' qc_taxa_values_ffg(data_benthos_PacNW) -#' -#' # Values, User (full names) -#' qc_taxa_values_ffg(data_benthos_MBSS, -#' "FFG", -#' valid_vals = c("Collector", -#' "Filterer", -#' "Predator", -#' "Scraper", -#' "Shredder")) -#' -#' @export -qc_taxa_values_ffg <- function(df_data, - col_ffg = "FFG", - valid_vals = c("CF", - "CG", - "MH", - "OM", - "PA", - "PH", - "PI", - "PR", - "SC", - "SH", - "XY")) { - - # QC---- - if (!rlang::as_string(col_ffg) %in% names(df_data)) { - stop("Column '", - rlang::as_string(col_ffg), - "' is missing from input data.", call. = FALSE) - }# IF ~ col_ffg - - # Convert valid_vals to data frame - df_valid_vals <- as.data.frame(valid_vals) - names(df_valid_vals) <- col_ffg - - # occurrence---- - df_match <- df_data |> - # occurrence - dplyr::count(.data[[col_ffg]], name = "n") |> - # valid - ## T/F - dplyr::mutate(valid = .data[[col_ffg]] %in% valid_vals) |> - ## values - dplyr::full_join(y = df_valid_vals, - by = dplyr::join_by({{col_ffg}})) |> - ## convert NA to TRUE - dplyr::mutate(valid = dplyr::case_when(is.na(valid) ~ TRUE, - .default = valid)) - - # Result---- - return(df_match) - -}## FUNCTION ~ END - diff --git a/R/qc_taxa_values_habit.R b/R/qc_taxa_values_habit.R deleted file mode 100644 index 2c9d2ef2..00000000 --- a/R/qc_taxa_values_habit.R +++ /dev/null @@ -1,75 +0,0 @@ -#' QC Habitat Values -#' -#' Performs basic QC of the Habit column against a list of accepted values. -#' -#' Returns a data frame the values from the input with counts (column = n) from -#' the Habit column and whether the value appeared in valid values (column = -#' valid). Values in the accepted values not appearing in the input are appended -#' to the bottom of the returned data frame. These values are marked as n = NA -#' and valid = TRUE. -#' -#' The default accepted values are the abbreviations are those used as -#' metric.values(); BU, CB, CN, SK, SP, and SW. Valid separated with "," are -#' first split apart and spaces removed. -#' -#' @param df_data A data frame containing taxa data. -#' @param col_habit The column containing Habit values. Default = "Habit" -#' @param valid_vals Accepted values. -#' Default = c(BU, CB, CN, SK ,SP, SW.) -#' -#' @return A data frame with col_habit values, occurrence (n), and if valid -#' (TRUE/FALSE). Additional values from valid_vals are appended. -#' -#' @examples -#' # Values, Default -#' qc_taxa_values_habit(data_benthos_MBSS) -#' -#' # Values, User -#' qc_taxa_values_habit(data_benthos_MBSS, -#' "Habit", -#' valid_vals = c("bu", "cb", "cn", "dv", "sk", "sp", "sw")) -#' -#' @export -qc_taxa_values_habit <- function(df_data, - col_habit = "Habit", - valid_vals = c("BU", - "CB", - "CN", - "SK", - "SP", - "SW")) { - - # QC---- - if (!rlang::as_string(col_habit) %in% names(df_data)) { - stop("Column '", - rlang::as_string(col_habit), - "' is missing from input data.", call. = FALSE) - }# IF ~ col_habit - - # Convert valid_vals to data frame - df_valid_vals <- as.data.frame(valid_vals) - names(df_valid_vals) <- col_habit - - # occurrence---- - df_match <- df_data |> - # get all values, split on comma with optional surrounding spaces - tidyr::separate_rows(.data[[col_habit]], sep = "\\s*,\\s*") |> - # remove spaces - dplyr::mutate({{col_habit}} := trimws(.data[[col_habit]])) |> - # occurrence - dplyr::count(.data[[col_habit]], name = "n") |> - # valid - ## T/F - dplyr::mutate(valid = .data[[col_habit]] %in% valid_vals) |> - ## values - dplyr::full_join(y = df_valid_vals, - by = dplyr::join_by({{col_habit}})) |> - ## convert NA to TRUE - dplyr::mutate(valid = dplyr::case_when(is.na(valid) ~ TRUE, - .default = valid)) - - # Result---- - return(df_match) - -}## FUNCTION ~ END - diff --git a/R/qc_taxa_values_logical.R b/R/qc_taxa_values_logical.R new file mode 100644 index 00000000..c8715663 --- /dev/null +++ b/R/qc_taxa_values_logical.R @@ -0,0 +1,69 @@ +#' QC Autecological Logical Values +#' +#' Performs basic QC of a logical column showing occurrence. +#' +#' Returns a data frame the values from the input with counts (column = n) from +#' the column. +#' +#' @param data A data frame containing autecological taxa data. +#' @param col_vals The column containing Tolerance Values. Default = "TolVal" +#' +#' @return A data frame with col_vals values, occurrence (n), and valid +#' (TRUE/FALSE). Missing values (TRUE, FALSE, or NA) are appended. +#' +#' @examples +#' # Exclude +#' qc_taxa_values_logical(data_benthos_MBSS, "EXCLUDE") +#' +#' # NonTarget +#' qc_taxa_values_logical(data_benthos_MBSS, "NONTARGET") +#' +#' @export +qc_taxa_values_logical <- function(data, + col_logical = NULL) { + + # QC---- + # col_logical, missing + if (is.null(col_logical)) { + stop("'col_logical' is missing.", call. = FALSE) + }## IF ~ col_logical is missing + + ## col_logical in data + if (!rlang::as_string(col_logical) %in% names(data)) { + stop("Column '", + rlang::as_string(col_logical), + "' is missing from input data.", call. = FALSE) + }# IF ~ col_logical exists + + ## col_logical is logical + if (!is.logical(data[[rlang::as_string(col_logical)]])) { + stop("Column '", + rlang::as_string(col_logical), + "' must be logical", call. = FALSE) + }## IF ~ col_logical is logical + + + # occurrence---- + df_result <- data |> + # all_of and = not working together in complete + # rename before and after + dplyr::rename(value = !!col_logical) |> + # occurrence + dplyr::count(value, name = "n") |> + # force valid value rows to exist + tidyr::complete( + value = c(TRUE, FALSE, NA), + fill = list(n = 0)) |> + # rename back + dplyr::rename(!!col_logical := value) |> + # valid + dplyr::mutate(valid = dplyr::case_when( + is.na(.data[[col_logical]]) | + .data[[col_logical]] %in% c(TRUE, FALSE)~ TRUE, + .default = FALSE)) + + # Result---- + return(df_result) + +}## FUNCTION ~ END + diff --git a/R/qc_taxa_values_numeric.R b/R/qc_taxa_values_numeric.R new file mode 100644 index 00000000..52c27f71 --- /dev/null +++ b/R/qc_taxa_values_numeric.R @@ -0,0 +1,92 @@ +#' QC Autecological Numeric Values +#' +#' Performs basic QC of a numeric column showing all values. +#' +#' Returns a data frame the values from the input with counts (column = n) from +#' the column. Given valid_min and valid_max are applied to each values and +#' evaluated as valid TRUE or FALSE. +#' +#' The accepted values for TolVal are 0 - 10. +#' +#' The BioMonTools accepted values for UFC are 1 - 6. +#' +#' @param data A data frame containing autecological taxa data. +#' @param col_vals The column containing Tolerance Values. Default = NA. +#' @param valid_min Valid values range minimum (inclusive). Default = NA. +#' @param valid_max Valid values range maximum (inclusive). Default = NA. +#' +#' @return A data frame with col_vals values, occurrence (n), and valid +#' (TRUE/FALSE) within range of valid_min and valid_max. +#' +#' @examples +#' # TolVal +#' qc_taxa_values_numeric(data_benthos_MBSS, "TOLVAL", 0, 10) +#' +#' # TolVal2 +#' qc_taxa_values_numeric(data_benthos_MBSS, "TOLVAL2", 0, 10) +#' +#' # UFC +#' qc_taxa_values_numeric(data_benthos_MBSS, "UFC", 1, 6) +#' +#' @export +qc_taxa_values_numeric <- function(data, + col_numeric = NULL, + valid_min = NULL, + valid_max = NULL) { + + # QC---- + # col_numeric, missing + if (is.null(col_numeric)) { + stop("'col_numeric' is missing.", call. = FALSE) + }## IF ~ col_numeric is missing + + ## col_numeric in data + if (!rlang::as_string(col_numeric) %in% names(data)) { + stop("Column '", + rlang::as_string(col_numeric), + "' is missing from input data.", call. = FALSE) + }# IF ~ col_numeric exists + + ## col_numeric is numeric + if (!is.numeric(data[[rlang::as_string(col_numeric)]])) { + stop("Column '", + rlang::as_string(col_numeric), + "' must be numeric", call. = FALSE) + }## IF ~ col_numeric is numeric + + ## valid_min, missing + if (!is.null(valid_min)) { + stop("'valid_min' is missing.", call. = FALSE) + }## IF ~ valid_min is missing + + ## valid_max, missing + if (!is.null(valid_min)) { + stop("'valid_max' is missing.", call. = FALSE) + }## IF ~ valid_max is missing + + ## valid_min is numeric + if (!is.numeric(valid_min)) { + stop("'valid_min' must be numeric.", call. = FALSE) + }## IF ~ valid_min is numeric + + ## valid_min is numeric + if (!is.numeric(valid_min)) { + stop("'valid_min' must be numeric.", call. = FALSE) + }## IF ~ valid_min is numeric + + # occurrence---- + df_result <- data |> + # occurrence + dplyr::count(.data[[col_numeric]], name = "n") |> + # valid + ## T/F + dplyr::mutate(valid = dplyr::case_when( + .data[[col_numeric]] >= valid_min & + .data[[col_numeric]] <= valid_max ~ TRUE, + .default = FALSE)) + + # Result---- + return(df_result) + +}## FUNCTION ~ END + diff --git a/R/qc_taxa_values_tolval.R b/R/qc_taxa_values_tolval.R deleted file mode 100644 index 0692375d..00000000 --- a/R/qc_taxa_values_tolval.R +++ /dev/null @@ -1,68 +0,0 @@ -#' QC Habitat Values -#' -#' Performs basic QC of the Tolerance Value column. -#' -#' Returns a data frame the values from the input with counts (column = n) from -#' the TolVal column and whether the value appeared in valid values (column = -#' valid). -#' -#' The default accepted values are 0 - 10. -#' -#' @param df_data A data frame containing taxa data. -#' @param col_tolval The column containing Tolerance Values. Default = "TolVal" -#' @param valid_min Valid values range minimum. Default = 0. -#' @param valid_max Valid values range maximum. Default = 10. -#' -#' @return A data frame with col_tolval values, occurrence (n), and if valid -#' (TRUE/FALSE). -#' -#' @examples -#' qc_taxa_values_tolval(data_benthos_MBSS, "TOLVAL") -#' -#' @export -qc_taxa_values_tolval <- function(df_data, - col_tolval = "TolVal", - valid_min = 0, - valid_max = 10) { - - # QC---- - ## col_tolval in df_data - if (!rlang::as_string(col_tolval) %in% names(df_data)) { - stop("Column '", - rlang::as_string(col_tolval), - "' is missing from input data.", call. = FALSE) - }# IF ~ col_tolval exists - - ## col_tolval is numeric - if (!is.numeric(df_data[[rlang::as_string(col_tolval)]])) { - stop("Column '", - rlang::as_string(col_tolval), - "' must be numeric.", call. = FALSE) - }## IF ~ col_tolval is numeric - - ## valid_min is numeric - if (!is.numeric(valid_min)) { - stop("'valid_min' must be numeric.", call. = FALSE) - }## IF ~ valid_min is numeric - - ## valid_min is numeric - if (!is.numeric(valid_min)) { - stop("'valid_min' must be numeric.", call. = FALSE) - }## IF ~ valid_min is numeric - - # occurrence---- - df_match <- df_data |> - # occurrence - dplyr::count(.data[[col_tolval]], name = "n") |> - # valid - ## T/F - dplyr::mutate(valid = dplyr::case_when( - .data[[col_tolval]] >= valid_min & - .data[[col_tolval]] <= valid_max ~ TRUE, - .default = FALSE)) - - # Result---- - return(df_match) - -}## FUNCTION ~ END - diff --git a/data-raw/ProcessData_TaxaMaster_Ben_BCG_PacNW.R b/data-raw/ProcessData_TaxaMaster_Ben_BCG_PacNW.R index f3844dbb..c77253b6 100644 --- a/data-raw/ProcessData_TaxaMaster_Ben_BCG_PacNW.R +++ b/data-raw/ProcessData_TaxaMaster_Ben_BCG_PacNW.R @@ -34,4 +34,3 @@ str(df) # TaxaMaster_Ben_BCG_PacNW <- df usethis::use_data(TaxaMaster_Ben_BCG_PacNW, overwrite = TRUE) - diff --git a/data-raw/ProcessData_taxa_names_issues.R b/data-raw/ProcessData_taxa_names_issues.R new file mode 100644 index 00000000..0d7290ee --- /dev/null +++ b/data-raw/ProcessData_taxa_names_issues.R @@ -0,0 +1,25 @@ +# Prepare data for examples for qc_taxa_names_proof +# From multiple datasets +# Examples of different issues to be caught by function +# +# Erik.Leppo@tetratech.com +# 20260415 +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Packages ---- +library(readxl) +library(usethis) + +# Data ---- +dn_data <- file.path("data-raw", "Data") +fn_data <- "BMT_Examples_Taxa_Issues.xlsx" +sh_data <- "example_name_issues" +skip_data <- 7 +df_data <- readxl::read_excel(file.path(dn_data, fn_data), + sheet = sh_data, + skip = skip_data) + +# Save ---- +data_taxa_names_issues <- df_data +usethis::use_data(data_taxa_names_issues, overwrite = TRUE) + diff --git a/data-raw/data/BMT_Examples_Taxa_Issues.xlsx b/data-raw/data/BMT_Examples_Taxa_Issues.xlsx new file mode 100644 index 00000000..4cefb426 Binary files /dev/null and b/data-raw/data/BMT_Examples_Taxa_Issues.xlsx differ diff --git a/data/data_taxa_names_issues.rda b/data/data_taxa_names_issues.rda new file mode 100644 index 00000000..59f1e495 Binary files /dev/null and b/data/data_taxa_names_issues.rda differ diff --git a/inst/extdata/MetricNames.xlsx b/inst/extdata/MetricNames.xlsx index 7ce678a6..01ca53b7 100644 Binary files a/inst/extdata/MetricNames.xlsx and b/inst/extdata/MetricNames.xlsx differ diff --git a/inst/extdata/MetricScoring.xlsx b/inst/extdata/MetricScoring.xlsx index 1127915f..19d10a76 100644 Binary files a/inst/extdata/MetricScoring.xlsx and b/inst/extdata/MetricScoring.xlsx differ diff --git a/man/qc_taxa_match_official.Rd b/man/qc_taxa_match_official.Rd index 34dca0fa..12962063 100644 --- a/man/qc_taxa_match_official.Rd +++ b/man/qc_taxa_match_official.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/qc_taxa_match_official.R +% Please edit documentation in R/qc_taxa_names_match.R \name{qc_taxa_match_official} \alias{qc_taxa_match_official} \title{Quality Control Check on User Data Against Master Taxa List} diff --git a/man/qc_taxa_names_proof.Rd b/man/qc_taxa_names_proof.Rd new file mode 100644 index 00000000..05b51a26 --- /dev/null +++ b/man/qc_taxa_names_proof.Rd @@ -0,0 +1,148 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/qc_taxa_names_proof.R +\name{qc_taxa_names_proof} +\alias{qc_taxa_names_proof} +\title{QC Taxa List Proofreading} +\usage{ +qc_taxa_names_proof(names, method = "jw", max_distance = 0.13) +} +\arguments{ +\item{names}{A character vector containing taxa name data.} + +\item{method}{String distance method (passed to stringdist). +Default = "jw"} + +\item{max_distance}{Numeric threshold for similarity. Default = 0.13} +} +\value{ +A data frame with col_tolval values, occurrence (n), and if valid +(TRUE/FALSE). +} +\description{ +Performs basic proofreading of names in a taxa list. +} +\details{ +Returns possible differences in a data frame with three columns (qc check, +name, potential match(es)). Not all hits are errors but are potential issues +that may need to be addressed. + +The distance check Computes pairwise string distances between names and +returns name pairs that are likely duplicates. + +Uses Jaro-Winkler (jw) distance by default which performs well for names. +Other options are Levenshtein (lv), good for typos, and osa, like Levenshtein +but slightly faster. + +Good thresholds are jw 0.1 to 0.2, lv and osa <= 2 + +The checks include: + +* **spaces**, leading or trailing, including html white space, or doulble +space, or more than 3 + +* **case**, differences + +* **sp** variants; (with/without .) sp and spp, inside next to slash + +* **stage** variants; adult, A, pupa, pupae, P, immature, I, imm, juv, +juvenile, larva, larvae, L, zoea, myses, mysops? + +* **probably**, variants; "?", " prob ", " prob. ", " probably " +* add parentheses + +* **cf**, variants start, or in string, cf, c.f., cf., c.f + +* backslash_dash_underscore + +complex cmplx + +all caps + +and, & + +star + +head + +possibly, poss, poss. + + +unknown unk undetermined undet(.), indet, indetermined + + large small with space or parentheses + + backslash_dash + +* **slash, direction** direction; including dash + +* **slash, taxa** x/y vs. y/x + +* **grp** variants; grp, gr, group, (with/without .) and without +and dash and genus group, gp, dash or space before + +* **unid** variants; unid, unidentified, unid diff, uid, (with/without .) + +diff without unid + +* **prob** variants; prob, prob., probably, including "?" (anywhere in text) + +* **sensu** + +* **parenthetical** text; sensu, prob, inc spec, (with/without .) + +* **near** variants; nr n + +aff. , f +flag + +quotes + +slash order; c/o vs o/c + +with, without, w/, w/o, w/ o, w /, w / o + +frag and fragment + +Tubificid + +* **colon** e.g., Family: Genus + +* **patterns** tera$ in Order, idae$ in Family, inae$ Subfamily, and +ini$ in Tribe. Look for those patterns not in the expected columns. +would need the entire taxa table. Right now only looking at a single vector. + +immature, imm, w/ and w/o hair chaetae, hair+pectinate, bifid +setae, chaetae + +Common authors not in parentheses, e.g., Epler + + +text mining algorithms (word similarity) +Other checks caught: + +some not included: + +* f. = forma = valid +} +\examples{ +# Example Issues +proof_issues <- qc_taxa_names_proof(data_taxa_names_issues$FinalID) +proof_issues$issues +lapply(proof_issues, nrow) + + +# Example Master Taxa Lists +proof_MBSS <- qc_taxa_names_proof(data_benthos_MBSS$TAXAID, "jw", 0.13) +proof_MBSS$issues +head(proof_MBSS$distance) + +proof_PacNW_taxaid <- qc_taxa_names_proof(data_benthos_PacNW$TaxaID) +proof_PacNW_taxaid$issues +head(proof_PacNW_taxaid$distance) + +proof_PacNW_master_taxaid <- qc_taxa_names_proof(TaxaMaster_Ben_BCG_PacNW$TaxaID) +proof_PacNW_master_taxaid$issues +proof_PacNW_master_taxaid$stage +head(proof_PacNW_taxaid$distance) + +} diff --git a/man/qc_taxa_phylo.Rd b/man/qc_taxa_phylo.Rd new file mode 100644 index 00000000..eb0a4cd1 --- /dev/null +++ b/man/qc_taxa_phylo.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/qc_taxa_phylo.R +\name{qc_taxa_phylo} +\alias{qc_taxa_phylo} +\title{QC Taxa Phylo} +\usage{ +qc_taxa_phylo( + data, + finalid = "FinalID", + phylo_names = c("Phylum", "Subphylum", "Class", "Subclass", "Order", "Suborder", + "Family", "Subfamily", "Tribe", "Genus"), + ignore_case = FALSE +) +} +\arguments{ +\item{data}{A data frame} + +\item{finalid}{Column name for FinalID. Default = "FinalID"} + +\item{phylo_names}{Vector of phylogenetic names in order from coarse to fine. +Default = c("Phylum", "Subphylum", "Class", "Subclass", "Order", "Suborder", +"Family", "Subfamily", "Tribe", "Genus")} + +\item{ignore_case}{Should case be ignore for checks. Default = FALSE.} +} +\value{ +A list elements for each qc check. +} +\description{ +Performs basic quality control on a phylogenetic list. +} +\details{ +Returns a list of with elements corresponding the various checks on +a phylogenetic (standard or master) taxa list. + +The phylogenetic list is multiple columns that the user will provide in rank +order from coarse to fine along with a FinalID column. + +The checks are listed below and only report the entries that fail. + +Some checks will detect those with potential issues but others that are +valid. + +* **unique_parent** Each taxonomic rank (child) has a unique parent (coarser rank). +Parents include all coarser ranks (as defined by user). + +* **phylo_unique_rank** Each name is in only one phylogenetic rank column + +* **phylo_as_finalid** Each phylogenetic name is also in FinalID + +* **finalid_as_phylo** Each final id is a phylogenetic name + +others checks? + +case (all lower, all upper) + +spaces +? +non A-Z (any case), e.g., slash, dash, underscore, parentheses, etc. + +NEED TRIGGER FOR ignore.case, default to FALSE + +If ignore_case is TRUE then all columns (finalid and phylo_names) will be +converted to upper case before checks are performed. +} +\examples{ +qc_phylo_PacNW <- qc_taxa_phylo(TaxaMaster_Ben_BCG_PacNW, + "TaxaID", + phylo_names = c("Phylum", + "SubPhylum", + "Class", + "SubClass", + "Order", + "SuperFamily", + "Family", + "Tribe", + "Genus", + "SubGenus", + "Species")) +qc_phylo_PacNW$issues +qc_phylo_PacNW$unique_parent +qc_phylo_PacNW$phylo_unique_rank +qc_phylo_PacNW$phylo_as_finalid +qc_phylo_PacNW$finalid_as_phylo + +} diff --git a/man/qc_taxa_values_char.Rd b/man/qc_taxa_values_char.Rd new file mode 100644 index 00000000..05d4923e --- /dev/null +++ b/man/qc_taxa_values_char.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/qc_taxa_values_character.R +\name{qc_taxa_values_char} +\alias{qc_taxa_values_char} +\title{QC Autecological Character Values} +\usage{ +qc_taxa_values_char(data, col_char = NULL, valid_vals = NULL, separator = NULL) +} +\arguments{ +\item{data}{A data frame containing autecological taxa data.} + +\item{col_char}{The column containing the character values to be checked.} + +\item{valid_vals}{Accepted values.} + +\item{separator}{If values should be separated and checked include a +delimiter. Default = NULL} +} +\value{ +A data frame with col_char values, occurrence (n), and if valid (TRUE/ +FALSE). Any missing valid_vals are appended. +} +\description{ +Performs basic QC of a character column against a list of accepted values. +} +\details{ +Returns a data frame the values from the input with counts (column = n) from +the column and whether the values appeared in valid values (column = +valid). Values in the accepted values not appearing in the input are appended +to the bottom of the returned data frame. These values are marked as n = NA +and valid = TRUE. If NA is a valid value it must be included in valid_vals +or in the output NA will be labeled as valid = FALSE. + +The default accepted values are the abbreviations are those used as +metric.values(). + +For Function Feeding Group (FFG); CF, CG, MH, OM, PA, PI, PR, SC, SH, and XY. User using FC +and GC over CF and CG can modify the accepted values. Both versions are +accepted in metric.values(). + +For Habit; BU, CB, CN, SK, SP, and SW. Valid separated with "," are +first split apart and spaces removed. + +life cycle + +bcg_attr + +habitat, habitat structure, elevation, gradient, thermal +} +\examples{ +Values, FFG, Abr +qc_taxa_values_char(data_benthos_PacNW, + "FFG", + valid_vals = c("CF", + "CG", + "MH", + "OM", + "PA", + "PH", + "PI", + "PR", + "SC", + "SH", + "XY", + NA)) + +# Values, FFG, full names +qc_taxa_values_char(data_benthos_MBSS, + "FFG", + valid_vals = c("Collector", + "Filterer", + "Predator", + "Scraper", + "Shredder")) + +# Values, Habit, no separator +qc_taxa_values_char(data_benthos_MBSS, + "Habit", + valid_vals = c("bu", "cb", "cn", "dv", "sk", "sp", "sw")) + +# Values, Habit, no separator +qc_taxa_values_char(data_benthos_MBSS, + "Habit", + valid_vals = c("bu", "cb", "cn", "dv", "sk", "sp", "sw"), + separator = ",") + + + +} diff --git a/man/qc_taxa_values_ffg.Rd b/man/qc_taxa_values_ffg.Rd deleted file mode 100644 index 1d290082..00000000 --- a/man/qc_taxa_values_ffg.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/qc_taxa_values_ffg.R -\name{qc_taxa_values_ffg} -\alias{qc_taxa_values_ffg} -\title{QC Functional Feeding Group (FFG) Values} -\usage{ -qc_taxa_values_ffg( - df_data, - col_ffg = "FFG", - valid_vals = c("CF", "CG", "MH", "OM", "PA", "PH", "PI", "PR", "SC", "SH", "XY") -) -} -\arguments{ -\item{df_data}{A data frame containing taxa data.} - -\item{col_ffg}{The column containing FFG values. Default = "FFG"} - -\item{valid_vals}{Accepted values. -Default = c(CF, CG, MH, OM ,PA, PH, PI, PR, SC, SH, XY)} -} -\value{ -A data frame with col_ffg values, occurrence (n), and if valid (TRUE/ -FALSE). Additional values from valid_vals are appended. -} -\description{ -Performs basic QC of the FFG column against a list of accepted values. -} -\details{ -Returns a data frame the values from the input with counts (column = n) from -the FFG column and whether the value appeared in valid values (column = -valid). Values in the accepted values not appearing in the input are appended -to the bottom of the returned data frame. These values are marked as n = NA -and valid = TRUE. - -The default accepted values are the abbreviations are those used as -metric.values(); CF, CG, MH, OM, PA, PI, PR, SC, SH, and XY. User using FC -and GC over CF and CG can modify the accepted values. Both versions are -accepted in metric.values(). -} -\examples{ -# Values, Default -qc_taxa_values_ffg(data_benthos_PacNW) - -# Values, User (full names) -qc_taxa_values_ffg(data_benthos_MBSS, - "FFG", - valid_vals = c("Collector", - "Filterer", - "Predator", - "Scraper", - "Shredder")) - -} diff --git a/man/qc_taxa_values_habit.Rd b/man/qc_taxa_values_habit.Rd deleted file mode 100644 index c966a943..00000000 --- a/man/qc_taxa_values_habit.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/qc_taxa_values_habit.R -\name{qc_taxa_values_habit} -\alias{qc_taxa_values_habit} -\title{QC Habitat Values} -\usage{ -qc_taxa_values_habit( - df_data, - col_habit = "Habit", - valid_vals = c("BU", "CB", "CN", "SK", "SP", "SW") -) -} -\arguments{ -\item{df_data}{A data frame containing taxa data.} - -\item{col_habit}{The column containing Habit values. Default = "Habit"} - -\item{valid_vals}{Accepted values. -Default = c(BU, CB, CN, SK ,SP, SW.)} -} -\value{ -A data frame with col_habit values, occurrence (n), and if valid -(TRUE/FALSE). Additional values from valid_vals are appended. -} -\description{ -Performs basic QC of the Habit column against a list of accepted values. -} -\details{ -Returns a data frame the values from the input with counts (column = n) from -the Habit column and whether the value appeared in valid values (column = -valid). Values in the accepted values not appearing in the input are appended -to the bottom of the returned data frame. These values are marked as n = NA -and valid = TRUE. - -The default accepted values are the abbreviations are those used as -metric.values(); BU, CB, CN, SK, SP, and SW. Valid separated with "," are -first split apart and spaces removed. -} -\examples{ -# Values, Default -qc_taxa_values_habit(data_benthos_MBSS) - -# Values, User -qc_taxa_values_habit(data_benthos_MBSS, - "Habit", - valid_vals = c("bu", "cb", "cn", "dv", "sk", "sp", "sw")) - -} diff --git a/man/qc_taxa_values_logical.Rd b/man/qc_taxa_values_logical.Rd new file mode 100644 index 00000000..eec009a3 --- /dev/null +++ b/man/qc_taxa_values_logical.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/qc_taxa_values_logical.R +\name{qc_taxa_values_logical} +\alias{qc_taxa_values_logical} +\title{QC Autecological Logical Values} +\usage{ +qc_taxa_values_logical(data, col_logical = NULL) +} +\arguments{ +\item{data}{A data frame containing autecological taxa data.} + +\item{col_vals}{The column containing Tolerance Values. Default = "TolVal"} +} +\value{ +A data frame with col_vals values, occurrence (n), and valid +(TRUE/FALSE). Missing values (TRUE, FALSE, or NA) are appended. +} +\description{ +Performs basic QC of a logical column showing occurrence. +} +\details{ +Returns a data frame the values from the input with counts (column = n) from +the column. +} +\examples{ +# Exclude +qc_taxa_values_logical(data_benthos_MBSS, "EXCLUDE") + +# NonTarget +qc_taxa_values_logical(data_benthos_MBSS, "NONTARGET") + +} diff --git a/man/qc_taxa_values_numeric.Rd b/man/qc_taxa_values_numeric.Rd new file mode 100644 index 00000000..dfb0159b --- /dev/null +++ b/man/qc_taxa_values_numeric.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/qc_taxa_values_numeric.R +\name{qc_taxa_values_numeric} +\alias{qc_taxa_values_numeric} +\title{QC Autecological Numeric Values} +\usage{ +qc_taxa_values_numeric( + data, + col_numeric = NULL, + valid_min = NULL, + valid_max = NULL +) +} +\arguments{ +\item{data}{A data frame containing autecological taxa data.} + +\item{valid_min}{Valid values range minimum (inclusive). Default = NA.} + +\item{valid_max}{Valid values range maximum (inclusive). Default = NA.} + +\item{col_vals}{The column containing Tolerance Values. Default = NA.} +} +\value{ +A data frame with col_vals values, occurrence (n), and valid +(TRUE/FALSE) within range of valid_min and valid_max. +} +\description{ +Performs basic QC of a numeric column showing all values. +} +\details{ +Returns a data frame the values from the input with counts (column = n) from +the column. Given valid_min and valid_max are applied to each values and +evaluated as valid TRUE or FALSE. + +The accepted values for TolVal are 0 - 10. + +The BioMonTools accepted values for UFC are 1 - 6. +} +\examples{ +# TolVal +qc_taxa_values_numeric(data_benthos_MBSS, "TOLVAL", 0, 10) + +# TolVal2 +qc_taxa_values_numeric(data_benthos_MBSS, "TOLVAL2", 0, 10) + +# UFC +qc_taxa_values_numeric(data_benthos_MBSS, "UFC", 1, 6) + +} diff --git a/man/qc_taxa_values_tolval.Rd b/man/qc_taxa_values_tolval.Rd deleted file mode 100644 index fef44730..00000000 --- a/man/qc_taxa_values_tolval.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/qc_taxa_values_tolval.R -\name{qc_taxa_values_tolval} -\alias{qc_taxa_values_tolval} -\title{QC Habitat Values} -\usage{ -qc_taxa_values_tolval( - df_data, - col_tolval = "TolVal", - valid_min = 0, - valid_max = 10 -) -} -\arguments{ -\item{df_data}{A data frame containing taxa data.} - -\item{col_tolval}{The column containing Tolerance Values. Default = "TolVal"} - -\item{valid_min}{Valid values range minimum. Default = 0.} - -\item{valid_max}{Valid values range maximum. Default = 10.} -} -\value{ -A data frame with col_tolval values, occurrence (n), and if valid -(TRUE/FALSE). -} -\description{ -Performs basic QC of the Tolerance Value column. -} -\details{ -Returns a data frame the values from the input with counts (column = n) from -the TolVal column and whether the value appeared in valid values (column = -valid). - -The default accepted values are 0 - 10. -} -\examples{ -qc_taxa_values_tolval(data_benthos_MBSS, "TOLVAL") - -} diff --git a/tests/testthat/test_qc_taxa_values_foo.R b/tests/testthat/test_qc_taxa_values_foo.R new file mode 100644 index 00000000..f4005c5b --- /dev/null +++ b/tests/testthat/test_qc_taxa_values_foo.R @@ -0,0 +1,36 @@ +# Test qc_taxa_values_* +# Erik.Leppo@tetratech.com +# 2026-04-05 +#~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Test error conditions and expected outputs +#~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# logical ---- +testthat::test_that("qc_taxa_values, logical, errors", { + # data + data <- BioMonTools::data_benthos_MBSS + + # data, missing + testthat::expect_error(qc_taxa_values_logical()) + + # column, missing + testthat::expect_error(qc_taxa_values_logical(data)) + + # column, wrong case + testthat::expect_error(qc_taxa_values_logical(data, "exclude")) + + # column, wrong class (numeric) + testthat::expect_error(qc_taxa_values_logical(data, "TOLVAL")) + + # column, wrong class (character) + testthat::expect_error(qc_taxa_values_logical(data, "FFG")) + + # no error + testthat::expect_no_error(qc_taxa_values_logical(data, "EXCLUDE")) + + +})## logical ~ errors + +# numeric ---- + +# character ----