• all_lower_case(): Translate all non-numeric strings of a data frame to lower case (e.g., "Env" to "env").

  • all_upper_case(): Translate all non-numeric strings of a data frame to upper case (e.g., "Env" to "ENV").

  • all_title_case(): Translate all non-numeric strings of a data frame to title case (e.g., "ENV" to "Env").

  • extract_number(): Extract the number(s) of a string.

  • extract_string(): Extract all strings, ignoring case.

  • find_text_in_num(): Find text characters in a numeric sequence and return the row index.

  • has_text_in_num(): Inspect columns looking for text in numeric sequence and return a warning if text is found.

  • remove_space(): Remove all blank spaces of a string.

  • remove_strings(): Remove all strings of a variable.

  • replace_number(): Replace numbers with a replacement.

  • replace_string(): Replace all strings with a replacement, ignoring case.

  • round_cols(): Round a selected column or a whole data frame to a given number of decimal places.

  • tidy_strings(): Tidy up character strings, non-numeric columns, or any selected columns in a data frame by putting all words in upper case, replacing any space, tabulation, or punctuation character with '_', and putting '_' between lower and upper case. Suppose that str = c("Env1", "env 1", "env.1") (which by definition should represent a unique level in plant breeding trials, e.g., environment 1) is subjected to tidy_strings(str): the result will then be c("ENV_1", "ENV_1", "ENV_1"). See the Examples section for more examples.

all_upper_case(.data, ...)

all_lower_case(.data, ...)

all_title_case(.data, ...)

extract_number(
  .data,
  var,
  new_var = new_var,
  drop = FALSE,
  pull = FALSE,
  .before = NULL,
  .after = NULL
)

extract_string(
  .data,
  var,
  new_var = new_var,
  drop = FALSE,
  pull = FALSE,
  .before = NULL,
  .after = NULL
)

find_text_in_num(.data, ...)

has_text_in_num(.data)

remove_space(.data, ...)

remove_strings(.data, ...)

replace_number(
  .data,
  var,
  new_var = new_var,
  pattern = NULL,
  replacement = "",
  drop = FALSE,
  pull = FALSE,
  .before = NULL,
  .after = NULL
)

replace_string(
  .data,
  var,
  new_var = new_var,
  pattern = NULL,
  replacement = "",
  ignore_case = FALSE,
  drop = FALSE,
  pull = FALSE,
  .before = NULL,
  .after = NULL
)

round_cols(.data, ..., digits = 2)

tidy_strings(.data, ..., sep = "_")

Arguments

.data

A data frame

...

The argument depends on the function used.

  • For round_cols() ... are the variables to round. If no variable is supplied, all the numeric variables in .data are used.

  • For all_lower_case(), all_upper_case(), all_title_case(), remove_strings(), and tidy_strings() ... are the variables to apply the function to. If no variable is supplied, the function will be applied to all non-numeric variables in .data.

var

The variable to extract or replace numbers or strings.

new_var

The name of the new variable containing the numbers or strings extracted or replaced. Defaults to new_var.

drop

Logical argument. If TRUE keeps the new variable new_var and drops the existing ones. Defaults to FALSE.

pull

Logical argument. If TRUE, returns the last column (on the assumption that's the column you've created most recently), as a vector.

.before, .after

For replace_string(), replace_number(), extract_string(), and extract_number(), the one-based column index or column name where to add the new columns.

pattern

A string to be matched. Regular Expression Syntax is also allowed.

replacement

A string for replacement.

ignore_case

If FALSE (default), the pattern matching is case sensitive and if TRUE, case is ignored during matching.

digits

The number of decimal places to round to (e.g., digits = 1 rounds 2.61 to 2.6).

sep

A character string to separate the terms. Defaults to "_".

Author

Tiago Olivoto tiagoolivoto@gmail.com

Examples

# \donttest{ library(metan) ################ Rounding numbers ############### # All numeric columns round_cols(data_ge2, digits = 1)
#> # A tibble: 156 x 18 #> ENV GEN REP PH EH EP EL ED CL CD CW KW NR #> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 A1 H1 1 2.6 1.7 0.7 16.1 52.2 28.1 16.3 25.1 216. 15.6 #> 2 A1 H1 2 2.9 1.8 0.6 14.2 50.3 27.6 14.5 21.4 184. 16 #> 3 A1 H1 3 2.7 1.6 0.6 16 50.7 28.4 16.4 24 208. 17.2 #> 4 A1 H10 1 2.8 1.6 0.6 16.7 54.1 31.7 17.4 26.2 194. 15.6 #> 5 A1 H10 2 2.8 1.7 0.6 14.9 52.7 32 15.5 20.7 176. 17.6 #> 6 A1 H10 3 2.7 1.5 0.6 16.7 52.7 30.4 17.5 26.8 207. 16.8 #> 7 A1 H11 1 2.8 1.5 0.5 17.4 51.7 30.6 18 26.2 217. 16.8 #> 8 A1 H11 2 2.7 1.6 0.6 16.7 47.2 28.7 17.2 24.1 181. 13.6 #> 9 A1 H11 3 2.8 1.7 0.6 15.8 47.9 27.6 16.4 20.5 166. 15.2 #> 10 A1 H12 1 2.7 1.5 0.6 14.9 47.5 28.2 15.5 20.1 161 14.8 #> # ... with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>, #> # PERK <dbl>, TKW <dbl>, NKE <dbl>
# Round specific columns round_cols(data_ge2, EP, digits = 1)
#> # A tibble: 156 x 18 #> ENV GEN REP PH EH EP EL ED CL CD CW KW NR #> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 A1 H1 1 2.61 1.71 0.7 16.1 52.2 28.1 16.3 25.1 217. 15.6 #> 2 A1 H1 2 2.87 1.76 0.6 14.2 50.3 27.6 14.5 21.4 184. 16 #> 3 A1 H1 3 2.68 1.58 0.6 16.0 50.7 28.4 16.4 24.0 208. 17.2 #> 4 A1 H10 1 2.83 1.64 0.6 16.7 54.1 31.7 17.4 26.2 194. 15.6 #> 5 A1 H10 2 2.79 1.71 0.6 14.9 52.7 32.0 15.5 20.7 176. 17.6 #> 6 A1 H10 3 2.72 1.51 0.6 16.7 52.7 30.4 17.5 26.8 207. 16.8 #> 7 A1 H11 1 2.75 1.51 0.5 17.4 51.7 30.6 18.0 26.2 217. 16.8 #> 8 A1 H11 2 2.72 1.56 0.6 16.7 47.2 28.7 17.2 24.1 181. 13.6 #> 9 A1 H11 3 2.77 1.67 0.6 15.8 47.9 27.6 16.4 20.5 166. 15.2 #> 10 A1 H12 1 2.73 1.54 0.6 14.9 47.5 28.2 15.5 20.1 161. 14.8 #> # ... with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>, #> # PERK <dbl>, TKW <dbl>, NKE <dbl>
########### Extract or replace numbers ########## # Extract numbers extract_number(data_ge, GEN)
#> # A tibble: 420 x 6 #> ENV GEN REP GY HM new_var #> <fct> <fct> <fct> <dbl> <dbl> <dbl> #> 1 E1 G1 1 2.17 44.9 1 #> 2 E1 G1 2 2.50 46.9 1 #> 3 E1 G1 3 2.43 47.8 1 #> 4 E1 G2 1 3.21 45.2 2 #> 5 E1 G2 2 2.93 45.3 2 #> 6 E1 G2 3 2.56 45.5 2 #> 7 E1 G3 1 2.77 46.7 3 #> 8 E1 G3 2 3.62 43.2 3 #> 9 E1 G3 3 2.28 47.8 3 #> 10 E1 G4 1 2.36 47.9 4 #> # ... with 410 more rows
extract_number(data_ge, var = GEN, drop = TRUE, new_var = g_number)
#> # A tibble: 420 x 1 #> g_number #> <dbl> #> 1 1 #> 2 1 #> 3 1 #> 4 2 #> 5 2 #> 6 2 #> 7 3 #> 8 3 #> 9 3 #> 10 4 #> # ... with 410 more rows
# Replace numbers replace_number(data_ge, GEN)
#> # A tibble: 420 x 6 #> ENV GEN REP GY HM new_var #> <fct> <fct> <fct> <dbl> <dbl> <chr> #> 1 E1 G1 1 2.17 44.9 G #> 2 E1 G1 2 2.50 46.9 G #> 3 E1 G1 3 2.43 47.8 G #> 4 E1 G2 1 3.21 45.2 G #> 5 E1 G2 2 2.93 45.3 G #> 6 E1 G2 3 2.56 45.5 G #> 7 E1 G3 1 2.77 46.7 G #> 8 E1 G3 2 3.62 43.2 G #> 9 E1 G3 3 2.28 47.8 G #> 10 E1 G4 1 2.36 47.9 G #> # ... with 410 more rows
replace_number(data_ge, var = GEN, pattern = "1", replacement = "_one", pull = TRUE)
#> [1] "G_one" "G_one" "G_one" "G2" "G2" "G2" "G3" "G3" #> [9] "G3" "G4" "G4" "G4" "G5" "G5" "G5" "G6" #> [17] "G6" "G6" "G7" "G7" "G7" "G8" "G8" "G8" #> [25] "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" #> [33] "G_one" "G2" "G2" "G2" "G3" "G3" "G3" "G4" #> [41] "G4" "G4" "G5" "G5" "G5" "G6" "G6" "G6" #> [49] "G7" "G7" "G7" "G8" "G8" "G8" "G9" "G9" #> [57] "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" #> [65] "G2" "G2" "G3" "G3" "G3" "G4" "G4" "G4" #> [73] "G5" "G5" "G5" "G6" "G6" "G6" "G7" "G7" #> [81] "G7" "G8" "G8" "G8" "G9" "G9" "G9" "G_one0" #> [89] "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" "G2" "G2" #> [97] "G3" "G3" "G3" "G4" "G4" "G4" "G5" "G5" #> [105] "G5" "G6" "G6" "G6" "G7" "G7" "G7" "G8" #> [113] "G8" "G8" "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" #> [121] "G_one" "G_one" "G_one" "G2" "G2" "G2" "G3" "G3" #> [129] "G3" "G4" "G4" "G4" "G5" "G5" "G5" "G6" #> [137] "G6" "G6" "G7" "G7" "G7" "G8" "G8" "G8" #> [145] "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" #> [153] "G_one" "G2" "G2" "G2" "G3" "G3" "G3" "G4" #> [161] "G4" "G4" "G5" "G5" "G5" "G6" "G6" "G6" #> [169] "G7" "G7" "G7" "G8" "G8" "G8" "G9" "G9" #> [177] "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" #> [185] "G2" "G2" "G3" "G3" "G3" "G4" "G4" "G4" #> [193] "G5" "G5" "G5" "G6" "G6" "G6" "G7" "G7" #> [201] "G7" "G8" "G8" "G8" "G9" "G9" "G9" "G_one0" #> [209] "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" "G2" "G2" #> [217] "G3" "G3" "G3" "G4" "G4" "G4" "G5" "G5" #> [225] "G5" "G6" "G6" "G6" "G7" "G7" "G7" "G8" #> [233] "G8" "G8" "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" #> [241] "G_one" "G_one" "G_one" "G2" "G2" "G2" "G3" "G3" #> [249] "G3" "G4" "G4" "G4" "G5" "G5" "G5" "G6" #> [257] "G6" "G6" "G7" "G7" "G7" "G8" "G8" "G8" #> [265] "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" #> [273] "G_one" "G2" "G2" "G2" "G3" "G3" "G3" "G4" #> [281] "G4" "G4" "G5" "G5" "G5" "G6" "G6" "G6" #> [289] "G7" "G7" "G7" "G8" "G8" "G8" "G9" "G9" #> 
[297] "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" #> [305] "G2" "G2" "G3" "G3" "G3" "G4" "G4" "G4" #> [313] "G5" "G5" "G5" "G6" "G6" "G6" "G7" "G7" #> [321] "G7" "G8" "G8" "G8" "G9" "G9" "G9" "G_one0" #> [329] "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" "G2" "G2" #> [337] "G3" "G3" "G3" "G4" "G4" "G4" "G5" "G5" #> [345] "G5" "G6" "G6" "G6" "G7" "G7" "G7" "G8" #> [353] "G8" "G8" "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" #> [361] "G_one" "G_one" "G_one" "G2" "G2" "G2" "G3" "G3" #> [369] "G3" "G4" "G4" "G4" "G5" "G5" "G5" "G6" #> [377] "G6" "G6" "G7" "G7" "G7" "G8" "G8" "G8" #> [385] "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" #> [393] "G_one" "G2" "G2" "G2" "G3" "G3" "G3" "G4" #> [401] "G4" "G4" "G5" "G5" "G5" "G6" "G6" "G6" #> [409] "G7" "G7" "G7" "G8" "G8" "G8" "G9" "G9" #> [417] "G9" "G_one0" "G_one0" "G_one0"
########## Extract, replace or remove strings ########## # Extract strings extract_string(data_ge, GEN)
#> # A tibble: 420 x 6 #> ENV GEN REP GY HM new_var #> <fct> <fct> <fct> <dbl> <dbl> <chr> #> 1 E1 G1 1 2.17 44.9 G #> 2 E1 G1 2 2.50 46.9 G #> 3 E1 G1 3 2.43 47.8 G #> 4 E1 G2 1 3.21 45.2 G #> 5 E1 G2 2 2.93 45.3 G #> 6 E1 G2 3 2.56 45.5 G #> 7 E1 G3 1 2.77 46.7 G #> 8 E1 G3 2 3.62 43.2 G #> 9 E1 G3 3 2.28 47.8 G #> 10 E1 G4 1 2.36 47.9 G #> # ... with 410 more rows
extract_string(data_ge, var = GEN, drop = TRUE, new_var = g_name)
#> # A tibble: 420 x 1 #> g_name #> <chr> #> 1 G #> 2 G #> 3 G #> 4 G #> 5 G #> 6 G #> 7 G #> 8 G #> 9 G #> 10 G #> # ... with 410 more rows
# Replace strings replace_string(data_ge, GEN)
#> # A tibble: 420 x 6 #> ENV GEN REP GY HM new_var #> <fct> <fct> <fct> <dbl> <dbl> <chr> #> 1 E1 G1 1 2.17 44.9 1 #> 2 E1 G1 2 2.50 46.9 1 #> 3 E1 G1 3 2.43 47.8 1 #> 4 E1 G2 1 3.21 45.2 2 #> 5 E1 G2 2 2.93 45.3 2 #> 6 E1 G2 3 2.56 45.5 2 #> 7 E1 G3 1 2.77 46.7 3 #> 8 E1 G3 2 3.62 43.2 3 #> 9 E1 G3 3 2.28 47.8 3 #> 10 E1 G4 1 2.36 47.9 4 #> # ... with 410 more rows
replace_string(data_ge, var = GEN, new_var = GENOTYPE, pattern = "G", replacement = "GENOTYPE_")
#> # A tibble: 420 x 6 #> ENV GEN REP GY HM GENOTYPE #> <fct> <fct> <fct> <dbl> <dbl> <chr> #> 1 E1 G1 1 2.17 44.9 GENOTYPE_1 #> 2 E1 G1 2 2.50 46.9 GENOTYPE_1 #> 3 E1 G1 3 2.43 47.8 GENOTYPE_1 #> 4 E1 G2 1 3.21 45.2 GENOTYPE_2 #> 5 E1 G2 2 2.93 45.3 GENOTYPE_2 #> 6 E1 G2 3 2.56 45.5 GENOTYPE_2 #> 7 E1 G3 1 2.77 46.7 GENOTYPE_3 #> 8 E1 G3 2 3.62 43.2 GENOTYPE_3 #> 9 E1 G3 3 2.28 47.8 GENOTYPE_3 #> 10 E1 G4 1 2.36 47.9 GENOTYPE_4 #> # ... with 410 more rows
# Remove strings remove_strings(data_ge)
#> # A tibble: 420 x 5 #> ENV GEN REP GY HM #> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 1 1 1 2.17 44.9 #> 2 1 1 2 2.50 46.9 #> 3 1 1 3 2.43 47.8 #> 4 1 2 1 3.21 45.2 #> 5 1 2 2 2.93 45.3 #> 6 1 2 3 2.56 45.5 #> 7 1 3 1 2.77 46.7 #> 8 1 3 2 3.62 43.2 #> 9 1 3 3 2.28 47.8 #> 10 1 4 1 2.36 47.9 #> # ... with 410 more rows
remove_strings(data_ge, ENV)
#> # A tibble: 420 x 5 #> ENV GEN REP GY HM #> <dbl> <fct> <fct> <dbl> <dbl> #> 1 1 G1 1 2.17 44.9 #> 2 1 G1 2 2.50 46.9 #> 3 1 G1 3 2.43 47.8 #> 4 1 G2 1 3.21 45.2 #> 5 1 G2 2 2.93 45.3 #> 6 1 G2 3 2.56 45.5 #> 7 1 G3 1 2.77 46.7 #> 8 1 G3 2 3.62 43.2 #> 9 1 G3 3 2.28 47.8 #> 10 1 G4 1 2.36 47.9 #> # ... with 410 more rows
############ Find text in numeric sequences ########### mixed_text <- data.frame(data_ge) mixed_text[2, 4] <- "2..503" mixed_text[3, 4] <- "3.2o75" find_text_in_num(mixed_text, GY)
#> [1] 2 3
############# upper, lower and title cases ############ gen_text <- c("GEN 1", "Gen 1", "gen 1") all_lower_case(gen_text)
#> [1] "gen 1" "gen 1" "gen 1"
all_upper_case(gen_text)
#> [1] "GEN 1" "GEN 1" "GEN 1"
all_title_case(gen_text)
#> [1] "Gen 1" "Gen 1" "Gen 1"
# A whole data frame all_lower_case(data_ge)
#> # A tibble: 420 x 5 #> ENV GEN REP GY HM #> <chr> <chr> <chr> <dbl> <dbl> #> 1 e1 g1 1 2.17 44.9 #> 2 e1 g1 2 2.50 46.9 #> 3 e1 g1 3 2.43 47.8 #> 4 e1 g2 1 3.21 45.2 #> 5 e1 g2 2 2.93 45.3 #> 6 e1 g2 3 2.56 45.5 #> 7 e1 g3 1 2.77 46.7 #> 8 e1 g3 2 3.62 43.2 #> 9 e1 g3 3 2.28 47.8 #> 10 e1 g4 1 2.36 47.9 #> # ... with 410 more rows
############### Tidy up messy text string ############## messy_env <- c("ENV 1", "Env 1", "Env1", "env1", "Env.1", "Env_1") tidy_strings(messy_env)
#> [1] "ENV_1" "ENV_1" "ENV_1" "ENV_1" "ENV_1" "ENV_1"
messy_gen <- c("GEN1", "gen 2", "Gen.3", "gen-4", "Gen_5", "GEN_6") tidy_strings(messy_gen)
#> [1] "GEN_1" "GEN_2" "GEN_3" "GEN_4" "GEN_5" "GEN_6"
messy_int <- c("EnvGen", "Env_Gen", "env gen", "Env Gen", "ENV.GEN", "ENV_GEN") tidy_strings(messy_int)
#> [1] "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN"
library(tibble) # Or a whole data frame df <- tibble(Env = messy_env, gen = messy_gen, Env_GEN = interaction(Env, gen), y = rnorm(6, 300, 10)) df
#> # A tibble: 6 x 4 #> Env gen Env_GEN y #> <chr> <chr> <fct> <dbl> #> 1 ENV 1 GEN1 ENV 1.GEN1 305. #> 2 Env 1 gen 2 Env 1.gen 2 287. #> 3 Env1 Gen.3 Env1.Gen.3 287. #> 4 env1 gen-4 env1.gen-4 309. #> 5 Env.1 Gen_5 Env.1.Gen_5 272. #> 6 Env_1 GEN_6 Env_1.GEN_6 302.
tidy_strings(df)
#> # A tibble: 6 x 4 #> Env gen Env_GEN y #> <chr> <chr> <chr> <dbl> #> 1 ENV_1 GEN_1 ENV_1_GEN_1 305. #> 2 ENV_1 GEN_2 ENV_1_GEN_2 287. #> 3 ENV_1 GEN_3 ENV_1_GEN_3 287. #> 4 ENV_1 GEN_4 ENV_1_GEN_4 309. #> 5 ENV_1 GEN_5 ENV_1_GEN_5 272. #> 6 ENV_1 GEN_6 ENV_1_GEN_6 302.
# }