Packages we will need:
library(rnaturalearth)
library(tidyverse)
library(skimr)
library(lubridate)
library(magrittr)
I use this post to keep code bits all in one place so I can check back here when I inevitably forget them.
For most of the snippets, we can use a map data.frame that we can download from the rnaturalearth package. So the code below downloads a map of the world.
# Download a medium-resolution world map as an sf data frame.
my_map <- ne_countries(scale = "medium", returnclass = "sf")

# Open the data viewer only in interactive sessions; View() is not usable
# when the script is run non-interactively.
if (interactive()) {
  View(my_map)
}

# Keep only the descriptive country columns as a plain tibble.
# sf geometry columns are "sticky": select(-geometry) silently keeps them,
# so the geometry has to be dropped explicitly with st_drop_geometry().
my_map <- my_map %>%
  sf::st_drop_geometry() %>%
  select(
    sovereignt,
    iso_a2,
    pop_est,
    gdp_md_est,
    economy,
    income_grp,
    region_un:region_wb
  ) %>%
  as_tibble()
1. How to keep only one data.frame in the R environment and remove everything else.
# Remove every object in the global environment except `my_map`.
# The list of names is computed inline: storing it in helper variables first
# would leave those helpers behind, because they are created after ls() runs.
rm(list = setdiff(ls(envir = .GlobalEnv), "my_map"), envir = .GlobalEnv)
2. Add an ID variable based on row number
my_map %<>% mutate(id = row_number())
3. Replace NAs across the whole data.frame with 0 (also using the assignment pipe `%<>%` from the magrittr package)
# Replace missing values with 0 in every numeric column.
# Restricted to numeric columns because replace_na() casts the replacement to
# the column's type, so 0 cannot be inserted into character columns.
my_map %<>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0)))
4. Plot missing variables in a data.frame
library(DataExplorer)
plot_missing(my_map)

5. Summarise all variables with skimr package
library(skimr)
skim(df)
6. Reverse score a variable
# Reverse-score a variable: the highest observed score becomes the lowest and
# vice versa. na.rm = TRUE keeps a single missing score from turning every
# reversed value into NA; rows with a missing score stay NA.
df %>%
  mutate(
    reversed_score_var =
      max(score_var, na.rm = TRUE) + min(score_var, na.rm = TRUE) - score_var
  )
Lots and lots of stringr package stuff (and a bit of regex)
7. To remove footnote brackets – like [4] and [11] - from a string
# Strip bracketed footnote markers such as [4] or [11] from `column`,
# writing the result back into df via the assignment pipe.
df %<>%
  mutate(
    column = str_replace_all(
      column,
      pattern = "\\[[0-9]+\\]",
      replacement = ""
    )
  )
\\[ : Matches the opening square bracket
[0-9]+ : Matches one or more digits
\\]: Matches the closing square bracket
8. Remove a string pattern from all variables in a data.frame
# Remove the "_map" pattern from every column name.
# rename_with() replaces the superseded rename_all().
my_map %<>%
  rename_with(~ str_remove(.x, "_map"))
9. How to extract a substring based on a pattern
my_map %<>%
mutate(my_pattern_substring = str_extract(my_string_variable, "my_pattern"))
10. To concatenate (link together) strings
str_c("a", "b", "c")
[1] "abc"
11. And how to compute the length of strings
str_length("abcdefme")
[1] 8
12. Extract substrings from a character variable
my_map %<>%
mutate(income_substring = substr(income_grp,
start = 1, stop = 3))
13. Split a string into pieces
# Split on commas; the result is a list with one character vector per input.
str_split("a,b,c", ",")
# The separator is only the comma, so the space after each comma is kept.
str_split("Merry, Christmas, to, you", ",")
[[1]]
[1] "Merry" " Christmas" " to" " you"
14. Replace matched patterns in a string
my_map %<>%
mutate(earning_grp =str_replace(income_grp, "income", "earning"))
earning_grp n
1 4. Lower middle earning 60
2 3. Upper middle earning 58
3 2. High earning: nonOECD 46
4 5. Low earning 42
5 1. High earning: OECD 35
15. Detect the presence or absence of a pattern
my_map %<>%
mutate(asia = str_detect(region_wb, "Asia"))
asia n
1 FALSE 127
2 TRUE 114
16. Count the number of occurrences of a pattern
my_map %<>%
mutate(oecd = str_count(income_grp, "OECD"))
oecd n
1 0 160
2 1 81
17. Trim leading and trailing whitespace
str_trim(" abc ")
Leaving stringr, back to other random code bits
18. Calculates the sum of values across all columns for each row in a data.frame
# Sum the numeric columns within each row.
# c_across(everything()) fails as soon as df has a non-numeric column, so the
# selection is limited to numeric columns. ungroup() removes the rowwise
# grouping so later verbs operate on the whole data.frame again.
df %>%
  rowwise() %>%
  mutate(sum = sum(c_across(where(is.numeric)))) %>%
  ungroup()
19. Using reduce() function from purrr package to iteratively combine elements in a vector
character_vector <- c("Good", "Will ", "Hunting")
reduce(character_vector, paste0)
20. Finding the maximum value in the disp column
reduce(mtcars$disp, pmax)
purrr `map()` code bits
21. Applying a summary function across variables in a data.frame
#' Count rows and sum a variable within each group
#'
#' @param df A data.frame.
#' @param var Unquoted column to sum (missing values are ignored).
#' @param grouping_var Unquoted column to group by.
#' @return One row per group with `count` and `sum_var`, sorted by
#'   descending `count`.
summary_stats_fun <- function(df, var, grouping_var) {
  grouped <- group_by(df, {{ grouping_var }})
  totals <- summarise(
    grouped,
    count = n(),
    sum_var = sum({{ var }}, na.rm = TRUE)
  )
  arrange(totals, desc(count))
}
#' Apply summary_stats_fun() to every data.frame in a list
#'
#' @param list_of_data A list of data.frames that share the relevant columns.
#' @param var Column to sum, given either as a string ("value") or as a bare
#'   name (value).
#' @param grouping_var Column to group by, as a string or a bare name.
#' @return A list with one summary data.frame per element of `list_of_data`.
map_summary_stats <- function(list_of_data, var, grouping_var) {
  # ensym() turns both "value" and value into the symbol `value`. Passing the
  # raw argument through would hand summary_stats_fun() a character constant
  # instead of a column reference.
  var_sym <- rlang::ensym(var)
  group_sym <- rlang::ensym(grouping_var)

  map(
    list_of_data,
    function(one_df) {
      # `!!` injects the symbols so `{{ }}` inside summary_stats_fun() sees
      # the actual column names.
      summary_stats_fun(one_df, var = !!var_sym, grouping_var = !!group_sym)
    }
  )
}
list_of_data <- list(
data.frame(country = c("A", "B", "A", "C", "B", "C"), value = c(10, 15, 20, 5, 8, 12)),
data.frame(country = c("A", "A", "B", "B", "C", "C"), value = c(8, 12, 15, 10, 5, 20))
)
result_summary_stats <- map_summary_stats(list_of_data, var = "value", grouping_var = "country")
22. How to remove rows from a data.frame that match a string pattern
df <- df %>%
filter(!grepl("pattern", column))
23. How to remove non-numeric characters
df <- df %>%
mutate(column = str_replace_all(column, "[^0-9]", ""))
24. Removing parentheses and contents within
df <- df %>%
mutate(column = str_replace_all(column, "\\(.*?\\)", ""))
25. How to split a string into two new variables
df <- df %>%
separate(column, into = c("new_col_1", "new_col_2"), sep = ",")
26. Extracting alphabetic characters
df <- df %>%
mutate(alpha_chars = str_extract_all(column, "[A-Za-z]"))
27. Remove scientific notation
old_scipen <- options("scipen") # Save the current scipen value
options(scipen = 999) # Disable scientific notation
options(old_scipen) # Reset
Some functions from the forcats package for factor variables
28. Count the occurrences of each level in a factor
fct_count(factor_variable)
29. Order levels by their frequency.
fct_infreq(factor_variable)
30. Lump levels into a specified number of top or bottom levels.
# Keep the 5 most frequent levels and lump the rest into "Other".
# fct_lump_n() is the specific variant forcats recommends over the generic
# fct_lump().
fct_lump_n(factor_variable, n = 5)
31. Collapse factor levels into broader categories.
fct_collapse(factor_variable, new_levels = c("Category1", "Category2"))
32. Relabel factor levels
# fct_relabel() takes a function that maps the old level names to new ones;
# it has no argument for a vector of replacement labels.
fct_relabel(factor_variable, ~ str_c("Label_", .x))
33. Reverse the order of factor levels.
fct_rev(factor_variable)
34. Make NAs explicit by adding a level for missing values.
# fct_explicit_na() is deprecated since forcats 1.0.0. "(Missing)" was its
# default label, so it is passed explicitly to keep the same result.
fct_na_value_to_level(factor_variable, level = "(Missing)")
35. Group infrequent levels into “Other” category.
# fct_other() keeps the levels named in `keep` and lumps every other level
# into "Other"; `keep` is a character vector of level names, not a count.
# (To keep the n most frequent levels instead, use fct_lump_n().)
fct_other(factor_variable, keep = c("Level1", "Level2"))
36. Combine two factors into a single factor whose levels are the observed combinations of the original levels.
fct_cross(factor1, factor2)
37. Recode factor levels.
# Each new = "old" pair is passed as its own argument; wrapping the pairs in
# a single named vector argument is rejected by fct_recode().
fct_recode(
  factor_variable,
  NewLevel1 = "OldLevel1",
  NewLevel2 = "OldLevel2"
)
38. Count the occurrences of each level in a factor.
fct_count(factor_variable)
And next we will look at lubridate functions I always need to look up. Dates are a pain.
39. Parse date character in the “Year-Month-Day” format.
ymd("2023-12-16")
40. Get the current date and time.
now()
41. Get the current date only
today()
42. How to extract a year from a date class.
year(ymd("2023-12-16"))
43. And to extract the hour from the time now
hour(now())
44. How to extract the day of the week from a date class
wday(ymd("2023-12-16"))
45. Rounding up or rounding down to the nearest time unit
floor_date(now(), "months")
ceiling_date(now(), "hours")
46. Create an interval object.
my_interval <- interval(start = ymd("2023-01-01"), end = ymd("2023-12-31"))
47. Get the timezone of date and time now
# lubridate's time zone accessor is tz(); there is no timezone() function.
tz(now())
And last the janitor package for cleaning variables
48. Clean names with lowercase letters and _underscores_
library(janitor)
cleaned_data <- clean_names(original_data)
49. Remove empty rows and columns from a data.frame
cleaned_data <- remove_empty(original_data)
50. How to remove columns with constant values
cleaned_data <- remove_constant(original_data)
51. How to find duplicate rows in a data.frame
# Rows that are duplicated on the combination of col1 and col2.
# get_dupes() selects columns through `...` (tidyselect), not through a
# `columns` argument.
duplicate_rows <- get_dupes(original_data, col1, col2)
52. How to add percentage sign (%) to a contingency table
# adorn_percentages() only converts counts to row proportions;
# adorn_pct_formatting() is the step that renders them as "12.3%".
table_with_percentages <- tabyl(original_data, col1, col2) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting()
53. Add row or column counts to a contingency table
# adorn_ns() appends the underlying counts to a table of percentages, e.g.
# "45.0% (9)", so it has to come after the percentage steps rather than be
# applied to the raw counts.
table_with_counts <- tabyl(original_data, col1, col2) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting() %>%
  adorn_ns()
54: Changing Data Types
df %>%
mutate(across(starts_with("num"), as.character))
55. Scaling Numeric Variables
# Standardise every numeric column (mean 0, sd 1).
# scale() returns a one-column matrix; as.numeric() turns it back into a
# plain vector so the data.frame does not end up with matrix columns.
df %>%
  mutate(across(where(is.numeric), ~ as.numeric(scale(.x))))
56. Applying a Custom Function
df %>%
mutate(across(contains("price"), ~ .x * 1.1))
57. Filter based on a condition
# Keep rows where every "...score" column is above 80.
# across() inside filter() is deprecated; if_all() states the intent directly
# (use if_any() for "at least one column").
df %>%
  filter(if_all(ends_with("score"), ~ .x > 80))
58. Impute missing values with the column mean
df %>%
mutate(across(starts_with("var"), ~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)))
59. Selecting Columns with Specific Data Types:
# Keep only the character columns.
# across() is for mutate()/summarise(); select() takes the where() predicate
# directly.
df %>%
  select(where(is.character))
60. Rename columns with paste()
# Prefix "new_" to every column whose name contains "old".
# rename() cannot take across(); rename_with() applies a function to the
# names of the selected columns.
df %>%
  rename_with(~ paste0("new_", .x), contains("old"))
61. How to calculate row sums
df %>%
mutate(total = rowSums(across(starts_with("quantity"))))
62. Group-wise scaling
# Standardise the "value..." columns within each category.
# as.numeric() drops the matrix wrapper that scale() returns, and ungroup()
# stops the grouping from leaking into later steps.
df %>%
  group_by(category) %>%
  mutate(across(starts_with("value"), ~ as.numeric(scale(.x)))) %>%
  ungroup()
63. How to conditionally mutate
df %>% mutate(across(starts_with("sales"), ~ ifelse(.x > 100, "High", "Low")))
64. Output a tidy regression model without scientific notation
# Fit the panel model, tidy the coefficients, sort by estimate, then format
# the numeric columns as fixed-point strings so nothing prints in scientific
# notation. Sorting happens before formatting, while estimate is still
# numeric.
plm(lead(dep_var, 2) ~ ind_var, index = c("country", "year"),
    data = df) %>%
  broom::tidy() %>%
  arrange(desc(estimate)) %>%
  mutate(
    across(
      c(estimate, std.error, statistic, p.value),
      ~ sprintf("%.10f", .x)
    )
  ) %>%
  # The pipe above was missing, which left print() as a standalone call with
  # nothing to print.
  print(n = 100)
65. How to choose the reference factor for a regression
df %<>%
mutate(factor_var = relevel(as.factor(factor_var), ref = "ref_level"))
66. Remove variables that end with a character string.
# Drop every column whose name ends in one of the listed suffixes.
# ends_with() accepts a vector of suffixes and matches any of them.
vdem %<>%
  select(
    -ends_with(c("_sd", "_codelow", "_codehigh", "_3C", "_4C", "_5C"))
  )
Can you add more code snippets in the comments?


