Random coding tips I always forget: 50+ tips for tidyverse, purrr, stringr, lubridate, janitor and other packages

Packages we will need:

library(rnaturalearth)
library(tidyverse)
library(skimr)
library(lubridate)
library(magrittr)

I use this post to keep code bits all in one place so I can check back here when I inevitably forget them.

Forget Will Smith GIF - Find & Share on GIPHY

For most of the snippets, we can use a map data.frame that we can download from the rnaturalearth package. So the code below downloads a map of the world.

my_map <- ne_countries(scale = "medium", returnclass = "sf")

my_map %>% View

my_map %<>% select(sovereignt,
iso_a2,
pop_est,
gdp_md_est,
economy,
income_grp,
region_un:region_wb)

# Convert to a plain tibble first: while my_map is still an sf object the
# geometry column is "sticky", so select(-geometry) would silently keep it.
my_map %<>%
  as_tibble() %>%
  select(-geometry)
  1. How to KEEP only one data.frame from the R environment.
# Collect the names of every object in the current environment.
all_objects <- ls()
# Everything except "my_map" is marked for removal.
objects_to_remove <- setdiff(all_objects, "my_map")
# Delete the marked objects from the global environment, leaving my_map behind.
rm(list = objects_to_remove, envir = .GlobalEnv)

2. Add an ID variable based on row number

my_map %<>% mutate(id = row_number())

3. Replace NAs across the whole data.frame with 0 (also using the assignment pipe `%<>%` from the magrittr package)

# Zero-fill the numeric columns only: replace_na() needs a replacement of the
# same type as the column, so a numeric 0 cannot go into character columns.
# across() replaces the superseded mutate_all().
my_map %<>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0)))

4. Plot missing variables in a data.frame

library(DataExplorer)
plot_missing(my_map)

5. Summarise all variables with skimr package

library(skimr)
skim(df)

6. Reverse score a variable

# Reverse-score: the highest observed score becomes the lowest and vice versa.
# na.rm = TRUE stops a single missing score from turning every result into NA.
df %>%
  mutate(
    reversed_score_var = max(score_var, na.rm = TRUE) +
      min(score_var, na.rm = TRUE) - score_var
  )

Lots and lots of stringr package stuff (and a bit of regex)

7. To remove footnote brackets – like [4] and [11] - from a string

 df <- df %>%
mutate(column = str_replace_all(column, "\\[[0-9]+\\]", ""))

\\[ : Matches the opening square bracket

[0-9]+ : Matches one or more digits

\\]: Matches the closing square bracket

8. Remove a string pattern from all variables in a data.frame

# rename_with() applies the function to every column name by default;
# it replaces the superseded rename_all().
my_map %<>%
  rename_with(~ str_remove(.x, "_map"))

9. How to extract a substring based on a pattern

my_map %<>%
mutate(my_pattern_substring = str_extract(my_string_variable, "my_pattern"))

10. To concatenate (link together) strings

str_c("a", "b", "c") 
[1] "abc"

11. And how to compute the length of strings

str_length("abcdefme")  
[1] 8

12. Extract substrings from a character variable

my_map %<>% 
mutate(income_substring = substr(income_grp,
start = 1, stop = 3))

13. Split a string into pieces

# Returns a list with one character vector per input string.
str_split("Merry, Christmas, to, you", ",")
[[1]]
[1] "Merry"      " Christmas" " to"        " you" 

14. Replace matched patterns in a string

my_map %<>% 
mutate(earning_grp =str_replace(income_grp, "income", "earning"))
1 4. Lower middle earning     60
2 3. Upper middle earning     58
3 2. High earning: nonOECD    46
4 5. Low earning              42
5 1. High earning: OECD       35

15. Detect the presence or absence of a pattern

my_map %<>% 
mutate(asia = str_detect(region_wb, "Asia"))
  asia      n

1 FALSE   127
2 TRUE    114

16. Count the number of occurrences of a pattern

my_map %<>% 
mutate(oecd = str_count(income_grp, "OECD"))

   oecd     n

1 0 160
2 1 81

17. Trim leading and trailing whitespace

str_trim("   abc   ")

Leaving stringr, back to other random code bits

18. Calculates the sum of values across all columns for each row in a data.frame

df %>% rowwise() %>%
mutate(sum = sum(c_across(everything())))

19. Using reduce() function from purrr package to iteratively combine elements in a vector

character_vector <- c("Good", "Will ", "Hunting")

reduce(character_vector, paste0)

20. Finding the maximum value in the disp column

reduce(mtcars$disp, pmax)

purrr map() code bits

21. Applying a summary function across variables in a data.frame

#' Count rows and sum one column within each group of a data frame.
#'
#' @param df A data frame.
#' @param var Column to sum, given as a bare name or as a string.
#' @param grouping_var Column to group by, given as a bare name or as a string.
#' @return A tibble with one row per group and the columns `count` and
#'   `sum_var`, sorted by descending `count`.
summary_stats_fun <- function(df, var, grouping_var) {
  df %>%
    # across() uses tidyselect, so both `country` and "country" select the column.
    group_by(across({{ grouping_var }})) %>%
    summarise(
      count = n(),
      across({{ var }}, ~ sum(.x, na.rm = TRUE), .names = "sum_var"),
      .groups = "drop"
    ) %>%
    arrange(desc(count))
}

#' Apply summary_stats_fun() to every data frame in a list.
#'
#' @param list_of_data A list of data frames that share the same columns.
#' @param var Column to sum, given as a bare name or as a string.
#' @param grouping_var Column to group by, given as a bare name or as a string.
#' @return A list of summary tibbles, one per element of `list_of_data`.
map_summary_stats <- function(list_of_data, var, grouping_var) {
  # Capture the column arguments once and inject them with `!!`. Forwarding
  # them as plain `var = var` would hand the inner function the symbol `var`
  # rather than the column the caller asked for.
  var <- enquo(var)
  grouping_var <- enquo(grouping_var)
  map(
    list_of_data,
    function(d) summary_stats_fun(d, var = !!var, grouping_var = !!grouping_var)
  )
}

list_of_data <- list(
  data.frame(country = c("A", "B", "A", "C", "B", "C"), value = c(10, 15, 20, 5, 8, 12)),
  data.frame(country = c("A", "A", "B", "B", "C", "C"), value = c(8, 12, 15, 10, 5, 20))
)

result_summary_stats <- map_summary_stats(list_of_data, var = "value", grouping_var = "country")

22. How to remove rows from a data.frame that match a string pattern

df <- df %>%
filter(!grepl("pattern", column))

23. How to remove non-numeric characters

df <- df %>%
mutate(column = str_replace_all(column, "[^0-9]", ""))

24. Removing parentheses and contents within

df <- df %>%
mutate(column = str_replace_all(column, "\\(.*?\\)", ""))

25. How to split a string into two new variables

# sep is treated as a regular expression; here it is a comma followed by a space.
df <- df %>%
  separate(column, into = c("new_col_1", "new_col_2"), sep = ", ")

26. Extracting alphabetic characters

df <- df %>%
mutate(alpha_chars = str_extract_all(column, "[A-Za-z]"))

27. Remove scientific notation

# options() hands back the previous values, so saving and setting fit in one call.
old_scipen <- options(scipen = 999) # Disable scientific notation, keep the old value
options(old_scipen) # Reset

Some functions from the forcats package for factor variables

28. Count the occurrences of each level in a factor

fct_count(factor_variable)

29. Order levels by their frequency.

fct_infreq(factor_variable)

30. Lump levels into a specified number of top or bottom levels.

fct_lump(factor_variable, n = 5)

31. Collapse factor levels into broader categories.

fct_collapse(factor_variable, new_levels = c("Category1", "Category2"))

32. Relabel factor levels

# fct_relabel() takes a function that is applied to the existing level names.
fct_relabel(factor_variable, ~ paste0("Label_", .x))

33. Reverse the order of factor levels.

fct_rev(factor_variable)

34. Make NAs explicit by adding a level for missing values.

# fct_explicit_na() is deprecated since forcats 1.0.0; this is its replacement.
fct_na_value_to_level(factor_variable, level = "(Missing)")

35. Group infrequent levels into “Other” category.

# keep names the levels to preserve; every other level becomes "Other".
# To keep the n most frequent levels instead, use fct_lump_n(factor_variable, n = 5).
fct_other(factor_variable, keep = c("Level1", "Level2"))

36. Combine two factors into a single factor whose levels are the observed combinations of both.

fct_cross(factor1, factor2)

37. Recode factor levels.

# Each mapping is passed directly as new_level = "old_level".
fct_recode(factor_variable, "NewLevel1" = "OldLevel1", "NewLevel2" = "OldLevel2")

38. Count the occurrences of each level in a factor.

fct_count(factor_variable)

And next we will look at lubridate functions I always need to look up. Dates are a pain.

39. Parse date character in the “Year-Month-Day” format.

ymd("2023-12-16")

40. Get the current date and time.

now()

41. Get the current date only

today()

42. How to extract a year from a date class.

year(ymd("2023-12-16"))

43. And to extract the hour from the time now

hour(now())

44. How to extract the day of the week from a date class

wday(ymd("2023-12-16"))

45. Rounding up or rounding down to the nearest time unit

floor_date(now(), "months")

ceiling_date(now(), "hours")

46. Create an interval object.

my_interval <- interval(start = ymd("2023-01-01"), end = ymd("2023-12-31"))

47. Get the timezone of date and time now

# tz() is the lubridate accessor for a date-time's time zone.
tz(now())

And last the janitor package for cleaning variables

48. Clean names with lowercase letters and _underscores_

library(janitor)
cleaned_data <- clean_names(original_data)

49. Remove empty rows and columns from a data.frame

# State which dimensions to clean explicitly rather than relying on the default.
cleaned_data <- remove_empty(original_data, which = c("rows", "cols"))

50. How to remove columns with constant values

cleaned_data <- remove_constant(original_data)

51. How to find duplicate rows in a data.frame

# The columns to check are passed as bare names; omit them to compare whole rows.
duplicate_rows <- get_dupes(original_data, col1, col2)

52. How to add percentage sign (%) to a contingency table

# adorn_percentages() turns counts into proportions;
# adorn_pct_formatting() then renders them as strings with a % sign.
table_with_percentages <- tabyl(original_data, col1, col2) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting()

53. Add row or column counts to a contingency table

# adorn_ns() appends the underlying counts to already-formatted percentages,
# so it comes after adorn_percentages() and adorn_pct_formatting().
table_with_counts <- tabyl(original_data, col1, col2) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting() %>%
  adorn_ns()

54: Changing Data Types

df %>%
mutate(across(starts_with("num"), as.character))

55. Scaling Numeric Variables

# scale() returns a one-column matrix; as.numeric() converts it back to a plain
# vector so the result stays an ordinary numeric column.
df %>%
  mutate(across(where(is.numeric), ~ as.numeric(scale(.x))))

56. Applying a Custom Function

df %>%
mutate(across(contains("price"), ~ .x * 1.1))

57. Filter based on a condition

# if_all() keeps rows where every selected column passes the test
# (use if_any() for "at least one"); across() inside filter() is deprecated.
df %>%
  filter(if_all(ends_with("score"), ~ .x > 80))

58. Impute missing values with the column mean

df %>%
mutate(across(starts_with("var"), ~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)))

59. Selecting Columns with Specific Data Types:

# select() already uses tidyselect, so where() goes in directly;
# across() only works inside data-masking verbs such as mutate().
df %>%
  select(where(is.character))

60. Rename columns with paste()

# rename_with(function, columns): the function receives the selected column
# names and returns the new ones.
df %>%
  rename_with(~ paste0("new_", .x), contains("old"))

61. How to calculate row sums

df %>%
mutate(total = rowSums(across(starts_with("quantity"))))

62. Group-wise scaling

# Standardise within each category. as.numeric() strips the matrix wrapper that
# scale() returns, and ungroup() stops later steps from silently running per group.
df %>%
  group_by(category) %>%
  mutate(across(starts_with("value"), ~ as.numeric(scale(.x)))) %>%
  ungroup()

63. How to conditionally mutate

  df %>% mutate(across(starts_with("sales"), ~ ifelse(.x > 100, "High", "Low")))

64. Output a tidy regression model without scientific notation

# Fit the panel model, tidy the coefficients, sort by estimate and print the
# numeric columns as fixed-point strings instead of scientific notation.
# NOTE(review): with dplyr attached, `lead` may resolve to dplyr::lead rather
# than plm's panel-aware version - confirm which one you intend.
plm(lead(dep_var, 2) ~ ind_var, index = c("country", "year"),
data = df) %>%
broom::tidy() %>%
arrange(desc(estimate)) %>%
# The trailing pipe matters: without it print(n = 100) runs on its own.
mutate(across(c(estimate, std.error, statistic, p.value), ~sprintf("%.10f", .))) %>%
print(n = 100)

65. How to choose the reference factor for a regression

df %<>%
mutate(factor_var = relevel(as.factor(factor_var), ref = "ref_level"))

66. Remove variables that end with a character string.

# Drop every column whose name ends in one of these suffixes;
# ends_with() accepts a whole vector of suffixes in a single call.
vdem %<>%
  select(-ends_with(c("_sd", "_codelow", "_codehigh", "_3C", "_4C", "_5C")))

Can you add more code snippets in the comments???

Schitts Creek Yes GIF by CBC - Find & Share on GIPHY