all_lower_case()
: Translate all non-numeric strings of a data frame
to lower case (e.g.,
"Env"
to "env"
).
all_upper_case()
: Translate all non-numeric strings of a data frame
to upper case (e.g., "Env"
to "ENV"
).
all_title_case()
: Translate all non-numeric strings of a data frame
to title case (e.g., "ENV"
to "Env"
).
extract_number()
: Extract the number(s) of a string.
extract_string()
: Extract all strings, ignoring case.
find_text_in_num()
: Find text characters in a numeric sequence and
return the row index.
has_text_in_num()
: Inspect columns looking for text in numeric
sequence and return a warning if text is found.
remove_space()
: Remove all blank spaces of a string.
remove_strings()
: Remove all strings of a variable.
replace_number()
: Replace numbers with a replacement.
replace_string()
: Replace all strings with a replacement, ignoring
case.
round_cols()
: Round a selected column or a whole data frame to
a given number of decimal places.
tidy_strings()
: Tidy up character strings, non-numeric columns, or
any selected columns in a data frame by putting all words in upper case,
replacing any space, tabulation, or punctuation character with '_'
, and
putting '_'
between lower and upper case. Suppose that str =
c("Env1", "env 1", "env.1")
(which by definition should represent a unique
level in plant breeding trials, e.g., environment 1) is subjected to
tidy_strings(str)
: the result will then be c("ENV_1", "ENV_1",
"ENV_1")
. See Examples section for more examples.
all_upper_case(.data, ...) all_lower_case(.data, ...) all_title_case(.data, ...) extract_number( .data, var, new_var = new_var, drop = FALSE, pull = FALSE, .before = NULL, .after = NULL ) extract_string( .data, var, new_var = new_var, drop = FALSE, pull = FALSE, .before = NULL, .after = NULL ) find_text_in_num(.data, ...) has_text_in_num(.data) remove_space(.data, ...) remove_strings(.data, ...) replace_number( .data, var, new_var = new_var, pattern = NULL, replacement = "", drop = FALSE, pull = FALSE, .before = NULL, .after = NULL ) replace_string( .data, var, new_var = new_var, pattern = NULL, replacement = "", ignore_case = FALSE, drop = FALSE, pull = FALSE, .before = NULL, .after = NULL ) round_cols(.data, ..., digits = 2) tidy_strings(.data, ..., sep = "_")
.data | A data frame |
---|---|
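... | The argument depends on the function used. For round_cols(), the variables to round; if no variable is given, all numeric variables are rounded. For the other functions that accept ..., the variables to which the function is applied (e.g., remove_strings(data_ge, ENV)).
|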
var | The variable to extract or replace numbers or strings. |
new_var | The name of the new variable containing the numbers or
strings extracted or replaced. Defaults to new_var. |
drop | Logical argument. If TRUE, keeps only the new variable new_var and drops the existing ones. Defaults to FALSE. |
pull | Logical argument. If TRUE, returns the new variable as a vector instead of a data frame. Defaults to FALSE. |
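.before, .after | For extract_number(), extract_string(), replace_number(), and replace_string(), the column (index or name) before or after which the new variable is placed. |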
pattern | A string to be matched. Regular Expression Syntax is also allowed. |
replacement | A string for replacement. |
ignore_case | If FALSE (default), the pattern matching is case sensitive; if TRUE, case is ignored during matching. |
digits | The number of decimal places to round to (e.g., digits = 1 turns 2.61 into 2.6). Defaults to 2. |
sep | A character string to separate the terms. Defaults to "_". |
Tiago Olivoto tiagoolivoto@gmail.com
# \donttest{ library(metan) ################ Rounding numbers ############### # All numeric columns round_cols(data_ge2, digits = 1)#> # A tibble: 156 x 18 #> ENV GEN REP PH EH EP EL ED CL CD CW KW NR #> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 A1 H1 1 2.6 1.7 0.7 16.1 52.2 28.1 16.3 25.1 216. 15.6 #> 2 A1 H1 2 2.9 1.8 0.6 14.2 50.3 27.6 14.5 21.4 184. 16 #> 3 A1 H1 3 2.7 1.6 0.6 16 50.7 28.4 16.4 24 208. 17.2 #> 4 A1 H10 1 2.8 1.6 0.6 16.7 54.1 31.7 17.4 26.2 194. 15.6 #> 5 A1 H10 2 2.8 1.7 0.6 14.9 52.7 32 15.5 20.7 176. 17.6 #> 6 A1 H10 3 2.7 1.5 0.6 16.7 52.7 30.4 17.5 26.8 207. 16.8 #> 7 A1 H11 1 2.8 1.5 0.5 17.4 51.7 30.6 18 26.2 217. 16.8 #> 8 A1 H11 2 2.7 1.6 0.6 16.7 47.2 28.7 17.2 24.1 181. 13.6 #> 9 A1 H11 3 2.8 1.7 0.6 15.8 47.9 27.6 16.4 20.5 166. 15.2 #> 10 A1 H12 1 2.7 1.5 0.6 14.9 47.5 28.2 15.5 20.1 161 14.8 #> # ... with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>, #> # PERK <dbl>, TKW <dbl>, NKE <dbl># Round specific columns round_cols(data_ge2, EP, digits = 1)#> # A tibble: 156 x 18 #> ENV GEN REP PH EH EP EL ED CL CD CW KW NR #> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 A1 H1 1 2.61 1.71 0.7 16.1 52.2 28.1 16.3 25.1 217. 15.6 #> 2 A1 H1 2 2.87 1.76 0.6 14.2 50.3 27.6 14.5 21.4 184. 16 #> 3 A1 H1 3 2.68 1.58 0.6 16.0 50.7 28.4 16.4 24.0 208. 17.2 #> 4 A1 H10 1 2.83 1.64 0.6 16.7 54.1 31.7 17.4 26.2 194. 15.6 #> 5 A1 H10 2 2.79 1.71 0.6 14.9 52.7 32.0 15.5 20.7 176. 17.6 #> 6 A1 H10 3 2.72 1.51 0.6 16.7 52.7 30.4 17.5 26.8 207. 16.8 #> 7 A1 H11 1 2.75 1.51 0.5 17.4 51.7 30.6 18.0 26.2 217. 16.8 #> 8 A1 H11 2 2.72 1.56 0.6 16.7 47.2 28.7 17.2 24.1 181. 13.6 #> 9 A1 H11 3 2.77 1.67 0.6 15.8 47.9 27.6 16.4 20.5 166. 15.2 #> 10 A1 H12 1 2.73 1.54 0.6 14.9 47.5 28.2 15.5 20.1 161. 14.8 #> # ... 
with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>, #> # PERK <dbl>, TKW <dbl>, NKE <dbl>########### Extract or replace numbers ########## # Extract numbers extract_number(data_ge, GEN)#> # A tibble: 420 x 6 #> ENV GEN REP GY HM new_var #> <fct> <fct> <fct> <dbl> <dbl> <dbl> #> 1 E1 G1 1 2.17 44.9 1 #> 2 E1 G1 2 2.50 46.9 1 #> 3 E1 G1 3 2.43 47.8 1 #> 4 E1 G2 1 3.21 45.2 2 #> 5 E1 G2 2 2.93 45.3 2 #> 6 E1 G2 3 2.56 45.5 2 #> 7 E1 G3 1 2.77 46.7 3 #> 8 E1 G3 2 3.62 43.2 3 #> 9 E1 G3 3 2.28 47.8 3 #> 10 E1 G4 1 2.36 47.9 4 #> # ... with 410 more rowsextract_number(data_ge, var = GEN, drop = TRUE, new_var = g_number)#> # A tibble: 420 x 1 #> g_number #> <dbl> #> 1 1 #> 2 1 #> 3 1 #> 4 2 #> 5 2 #> 6 2 #> 7 3 #> 8 3 #> 9 3 #> 10 4 #> # ... with 410 more rows# Replace numbers replace_number(data_ge, GEN)#> # A tibble: 420 x 6 #> ENV GEN REP GY HM new_var #> <fct> <fct> <fct> <dbl> <dbl> <chr> #> 1 E1 G1 1 2.17 44.9 G #> 2 E1 G1 2 2.50 46.9 G #> 3 E1 G1 3 2.43 47.8 G #> 4 E1 G2 1 3.21 45.2 G #> 5 E1 G2 2 2.93 45.3 G #> 6 E1 G2 3 2.56 45.5 G #> 7 E1 G3 1 2.77 46.7 G #> 8 E1 G3 2 3.62 43.2 G #> 9 E1 G3 3 2.28 47.8 G #> 10 E1 G4 1 2.36 47.9 G #> # ... 
with 410 more rowsreplace_number(data_ge, var = GEN, pattern = "1", replacement = "_one", pull = TRUE)#> [1] "G_one" "G_one" "G_one" "G2" "G2" "G2" "G3" "G3" #> [9] "G3" "G4" "G4" "G4" "G5" "G5" "G5" "G6" #> [17] "G6" "G6" "G7" "G7" "G7" "G8" "G8" "G8" #> [25] "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" #> [33] "G_one" "G2" "G2" "G2" "G3" "G3" "G3" "G4" #> [41] "G4" "G4" "G5" "G5" "G5" "G6" "G6" "G6" #> [49] "G7" "G7" "G7" "G8" "G8" "G8" "G9" "G9" #> [57] "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" #> [65] "G2" "G2" "G3" "G3" "G3" "G4" "G4" "G4" #> [73] "G5" "G5" "G5" "G6" "G6" "G6" "G7" "G7" #> [81] "G7" "G8" "G8" "G8" "G9" "G9" "G9" "G_one0" #> [89] "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" "G2" "G2" #> [97] "G3" "G3" "G3" "G4" "G4" "G4" "G5" "G5" #> [105] "G5" "G6" "G6" "G6" "G7" "G7" "G7" "G8" #> [113] "G8" "G8" "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" #> [121] "G_one" "G_one" "G_one" "G2" "G2" "G2" "G3" "G3" #> [129] "G3" "G4" "G4" "G4" "G5" "G5" "G5" "G6" #> [137] "G6" "G6" "G7" "G7" "G7" "G8" "G8" "G8" #> [145] "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" #> [153] "G_one" "G2" "G2" "G2" "G3" "G3" "G3" "G4" #> [161] "G4" "G4" "G5" "G5" "G5" "G6" "G6" "G6" #> [169] "G7" "G7" "G7" "G8" "G8" "G8" "G9" "G9" #> [177] "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" #> [185] "G2" "G2" "G3" "G3" "G3" "G4" "G4" "G4" #> [193] "G5" "G5" "G5" "G6" "G6" "G6" "G7" "G7" #> [201] "G7" "G8" "G8" "G8" "G9" "G9" "G9" "G_one0" #> [209] "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" "G2" "G2" #> [217] "G3" "G3" "G3" "G4" "G4" "G4" "G5" "G5" #> [225] "G5" "G6" "G6" "G6" "G7" "G7" "G7" "G8" #> [233] "G8" "G8" "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" #> [241] "G_one" "G_one" "G_one" "G2" "G2" "G2" "G3" "G3" #> [249] "G3" "G4" "G4" "G4" "G5" "G5" "G5" "G6" #> [257] "G6" "G6" "G7" "G7" "G7" "G8" "G8" "G8" #> [265] "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" #> [273] "G_one" "G2" "G2" "G2" "G3" "G3" "G3" "G4" 
#> [281] "G4" "G4" "G5" "G5" "G5" "G6" "G6" "G6" #> [289] "G7" "G7" "G7" "G8" "G8" "G8" "G9" "G9" #> [297] "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" #> [305] "G2" "G2" "G3" "G3" "G3" "G4" "G4" "G4" #> [313] "G5" "G5" "G5" "G6" "G6" "G6" "G7" "G7" #> [321] "G7" "G8" "G8" "G8" "G9" "G9" "G9" "G_one0" #> [329] "G_one0" "G_one0" "G_one" "G_one" "G_one" "G2" "G2" "G2" #> [337] "G3" "G3" "G3" "G4" "G4" "G4" "G5" "G5" #> [345] "G5" "G6" "G6" "G6" "G7" "G7" "G7" "G8" #> [353] "G8" "G8" "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" #> [361] "G_one" "G_one" "G_one" "G2" "G2" "G2" "G3" "G3" #> [369] "G3" "G4" "G4" "G4" "G5" "G5" "G5" "G6" #> [377] "G6" "G6" "G7" "G7" "G7" "G8" "G8" "G8" #> [385] "G9" "G9" "G9" "G_one0" "G_one0" "G_one0" "G_one" "G_one" #> [393] "G_one" "G2" "G2" "G2" "G3" "G3" "G3" "G4" #> [401] "G4" "G4" "G5" "G5" "G5" "G6" "G6" "G6" #> [409] "G7" "G7" "G7" "G8" "G8" "G8" "G9" "G9" #> [417] "G9" "G_one0" "G_one0" "G_one0"########## Extract, replace or remove strings ########## # Extract strings extract_string(data_ge, GEN)#> # A tibble: 420 x 6 #> ENV GEN REP GY HM new_var #> <fct> <fct> <fct> <dbl> <dbl> <chr> #> 1 E1 G1 1 2.17 44.9 G #> 2 E1 G1 2 2.50 46.9 G #> 3 E1 G1 3 2.43 47.8 G #> 4 E1 G2 1 3.21 45.2 G #> 5 E1 G2 2 2.93 45.3 G #> 6 E1 G2 3 2.56 45.5 G #> 7 E1 G3 1 2.77 46.7 G #> 8 E1 G3 2 3.62 43.2 G #> 9 E1 G3 3 2.28 47.8 G #> 10 E1 G4 1 2.36 47.9 G #> # ... with 410 more rowsextract_string(data_ge, var = GEN, drop = TRUE, new_var = g_name)#> # A tibble: 420 x 1 #> g_name #> <chr> #> 1 G #> 2 G #> 3 G #> 4 G #> 5 G #> 6 G #> 7 G #> 8 G #> 9 G #> 10 G #> # ... 
with 410 more rows# Replace strings replace_string(data_ge, GEN)#> # A tibble: 420 x 6 #> ENV GEN REP GY HM new_var #> <fct> <fct> <fct> <dbl> <dbl> <chr> #> 1 E1 G1 1 2.17 44.9 1 #> 2 E1 G1 2 2.50 46.9 1 #> 3 E1 G1 3 2.43 47.8 1 #> 4 E1 G2 1 3.21 45.2 2 #> 5 E1 G2 2 2.93 45.3 2 #> 6 E1 G2 3 2.56 45.5 2 #> 7 E1 G3 1 2.77 46.7 3 #> 8 E1 G3 2 3.62 43.2 3 #> 9 E1 G3 3 2.28 47.8 3 #> 10 E1 G4 1 2.36 47.9 4 #> # ... with 410 more rowsreplace_string(data_ge, var = GEN, new_var = GENOTYPE, pattern = "G", replacement = "GENOTYPE_")#> # A tibble: 420 x 6 #> ENV GEN REP GY HM GENOTYPE #> <fct> <fct> <fct> <dbl> <dbl> <chr> #> 1 E1 G1 1 2.17 44.9 GENOTYPE_1 #> 2 E1 G1 2 2.50 46.9 GENOTYPE_1 #> 3 E1 G1 3 2.43 47.8 GENOTYPE_1 #> 4 E1 G2 1 3.21 45.2 GENOTYPE_2 #> 5 E1 G2 2 2.93 45.3 GENOTYPE_2 #> 6 E1 G2 3 2.56 45.5 GENOTYPE_2 #> 7 E1 G3 1 2.77 46.7 GENOTYPE_3 #> 8 E1 G3 2 3.62 43.2 GENOTYPE_3 #> 9 E1 G3 3 2.28 47.8 GENOTYPE_3 #> 10 E1 G4 1 2.36 47.9 GENOTYPE_4 #> # ... with 410 more rows# Remove strings remove_strings(data_ge)#> # A tibble: 420 x 5 #> ENV GEN REP GY HM #> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 1 1 1 2.17 44.9 #> 2 1 1 2 2.50 46.9 #> 3 1 1 3 2.43 47.8 #> 4 1 2 1 3.21 45.2 #> 5 1 2 2 2.93 45.3 #> 6 1 2 3 2.56 45.5 #> 7 1 3 1 2.77 46.7 #> 8 1 3 2 3.62 43.2 #> 9 1 3 3 2.28 47.8 #> 10 1 4 1 2.36 47.9 #> # ... with 410 more rowsremove_strings(data_ge, ENV)#> # A tibble: 420 x 5 #> ENV GEN REP GY HM #> <dbl> <fct> <fct> <dbl> <dbl> #> 1 1 G1 1 2.17 44.9 #> 2 1 G1 2 2.50 46.9 #> 3 1 G1 3 2.43 47.8 #> 4 1 G2 1 3.21 45.2 #> 5 1 G2 2 2.93 45.3 #> 6 1 G2 3 2.56 45.5 #> 7 1 G3 1 2.77 46.7 #> 8 1 G3 2 3.62 43.2 #> 9 1 G3 3 2.28 47.8 #> 10 1 G4 1 2.36 47.9 #> # ... 
with 410 more rows############ Find text in numeric sequences ########### mixed_text <- data.frame(data_ge) mixed_text[2, 4] <- "2..503" mixed_text[3, 4] <- "3.2o75" find_text_in_num(mixed_text, GY)#> [1] 2 3############# upper, lower and title cases ############ gen_text <- c("GEN 1", "Gen 1", "gen 1") all_lower_case(gen_text)#> [1] "gen 1" "gen 1" "gen 1"all_upper_case(gen_text)#> [1] "GEN 1" "GEN 1" "GEN 1"all_title_case(gen_text)#> [1] "Gen 1" "Gen 1" "Gen 1"# A whole data frame all_lower_case(data_ge)#> # A tibble: 420 x 5 #> ENV GEN REP GY HM #> <chr> <chr> <chr> <dbl> <dbl> #> 1 e1 g1 1 2.17 44.9 #> 2 e1 g1 2 2.50 46.9 #> 3 e1 g1 3 2.43 47.8 #> 4 e1 g2 1 3.21 45.2 #> 5 e1 g2 2 2.93 45.3 #> 6 e1 g2 3 2.56 45.5 #> 7 e1 g3 1 2.77 46.7 #> 8 e1 g3 2 3.62 43.2 #> 9 e1 g3 3 2.28 47.8 #> 10 e1 g4 1 2.36 47.9 #> # ... with 410 more rows############### Tidy up messy text string ############## messy_env <- c("ENV 1", "Env 1", "Env1", "env1", "Env.1", "Env_1") tidy_strings(messy_env)#> [1] "ENV_1" "ENV_1" "ENV_1" "ENV_1" "ENV_1" "ENV_1"messy_gen <- c("GEN1", "gen 2", "Gen.3", "gen-4", "Gen_5", "GEN_6") tidy_strings(messy_gen)#> [1] "GEN_1" "GEN_2" "GEN_3" "GEN_4" "GEN_5" "GEN_6"messy_int <- c("EnvGen", "Env_Gen", "env gen", "Env Gen", "ENV.GEN", "ENV_GEN") tidy_strings(messy_int)#> [1] "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN" "ENV_GEN"library(tibble) # Or a whole data frame df <- tibble(Env = messy_env, gen = messy_gen, Env_GEN = interaction(Env, gen), y = rnorm(6, 300, 10)) df#> # A tibble: 6 x 4 #> Env gen Env_GEN y #> <chr> <chr> <fct> <dbl> #> 1 ENV 1 GEN1 ENV 1.GEN1 305. #> 2 Env 1 gen 2 Env 1.gen 2 287. #> 3 Env1 Gen.3 Env1.Gen.3 287. #> 4 env1 gen-4 env1.gen-4 309. #> 5 Env.1 Gen_5 Env.1.Gen_5 272. #> 6 Env_1 GEN_6 Env_1.GEN_6 302.tidy_strings(df)#> # A tibble: 6 x 4 #> Env gen Env_GEN y #> <chr> <chr> <chr> <dbl> #> 1 ENV_1 GEN_1 ENV_1_GEN_1 305. #> 2 ENV_1 GEN_2 ENV_1_GEN_2 287. #> 3 ENV_1 GEN_3 ENV_1_GEN_3 287. #> 4 ENV_1 GEN_4 ENV_1_GEN_4 309. #> 5 ENV_1 GEN_5 ENV_1_GEN_5 272. #> 6 ENV_1 GEN_6 ENV_1_GEN_6 302.# }