Impute the columns of a data.frame with their mean, median or mode.
impute_dt(.data, ..., .func = "mode")
.data | A data.frame |
---|---|
... | Columns to select |
.func | Character, "mode" (default), "mean" or "median". A user-defined function can also be supplied. |
A data.table
Pclass <- c(3, 1, 3, 1, 3, 2, 2, 3, NA, NA) Sex <- c('male', 'male', 'female', 'female', 'female', 'female', NA, 'male', 'female', NA) Age <- c(22, 38, 26, 35, NA, 45, 25, 39, 28, 40) SibSp <- c(0, 1, 3, 1, 2, 3, 2, 2, NA, 0) Fare <- c(7.25, 71.3, 7.92, NA, 8.05, 8.46, 51.9, 60, 32, 15) Embarked <- c('S', NA, 'S', 'Q', 'Q', 'S', 'C', 'S', 'C', 'S') data <- data.frame('Pclass' = Pclass, 'Sex' = Sex, 'Age' = Age, 'SibSp' = SibSp, 'Fare' = Fare, 'Embarked' = Embarked) data #> Pclass Sex Age SibSp Fare Embarked #> 1 3 male 22 0 7.25 S #> 2 1 male 38 1 71.30 <NA> #> 3 3 female 26 3 7.92 S #> 4 1 female 35 1 NA Q #> 5 3 female NA 2 8.05 Q #> 6 2 female 45 3 8.46 S #> 7 2 <NA> 25 2 51.90 C #> 8 3 male 39 2 60.00 S #> 9 NA female 28 NA 32.00 C #> 10 NA <NA> 40 0 15.00 S data %>% impute_dt() # default uses "mode" as `.func` #> Pclass Sex Age SibSp Fare Embarked #> <num> <fctr> <num> <num> <num> <fctr> #> 1: 3 male 22 0 7.25 S #> 2: 1 male 38 1 71.30 S #> 3: 3 female 26 3 7.92 S #> 4: 1 female 35 1 7.25 Q #> 5: 3 female 22 2 8.05 Q #> 6: 2 female 45 3 8.46 S #> 7: 2 female 25 2 51.90 C #> 8: 3 male 39 2 60.00 S #> 9: 3 female 28 2 32.00 C #> 10: 3 female 40 0 15.00 S data %>% impute_dt(is.numeric,.func = "mean") #> Pclass Sex Age SibSp Fare Embarked #> <num> <fctr> <num> <num> <num> <fctr> #> 1: 3.00 male 22.00000 0.000000 7.25000 S #> 2: 1.00 male 38.00000 1.000000 71.30000 <NA> #> 3: 3.00 female 26.00000 3.000000 7.92000 S #> 4: 1.00 female 35.00000 1.000000 29.09778 Q #> 5: 3.00 female 33.11111 2.000000 8.05000 Q #> 6: 2.00 female 45.00000 3.000000 8.46000 S #> 7: 2.00 <NA> 25.00000 2.000000 51.90000 C #> 8: 3.00 male 39.00000 2.000000 60.00000 S #> 9: 2.25 female 28.00000 1.555556 32.00000 C #> 10: 2.25 <NA> 40.00000 0.000000 15.00000 S data %>% impute_dt(is.numeric,.func = "median") #> Pclass Sex Age SibSp Fare Embarked #> <num> <fctr> <num> <num> <num> <fctr> #> 1: 3.0 male 22 0 7.25 S #> 2: 1.0 male 38 1 71.30 <NA> #> 3: 3.0 female 26 3 7.92 S #> 4: 1.0 female 35 1 15.00 Q
#> 5: 3.0 female 35 2 8.05 Q #> 6: 2.0 female 45 3 8.46 S #> 7: 2.0 <NA> 25 2 51.90 C #> 8: 3.0 male 39 2 60.00 S #> 9: 2.5 female 28 2 32.00 C #> 10: 2.5 <NA> 40 0 15.00 S # use user-defined function my_fun = function(x){ (max(x,na.rm = TRUE) - min(x,na.rm = TRUE))/2 } data %>% impute_dt(is.numeric,.func = my_fun) #> Pclass Sex Age SibSp Fare Embarked #> <num> <fctr> <num> <num> <num> <fctr> #> 1: 1 male 11.5 1.5 32.025 S #> 2: 1 male 11.5 1.5 32.025 <NA> #> 3: 1 female 11.5 1.5 32.025 S #> 4: 1 female 11.5 1.5 32.025 Q #> 5: 1 female 11.5 1.5 32.025 Q #> 6: 1 female 11.5 1.5 32.025 S #> 7: 1 <NA> 11.5 1.5 32.025 C #> 8: 1 male 11.5 1.5 32.025 S #> 9: 1 female 11.5 1.5 32.025 C #> 10: 1 <NA> 11.5 1.5 32.025 S