This is another blog post that I am writing so that I have easy access to code snippets for my own records.
We will use an example with data from V-DEM.
Click here to read more about downloading the V-DEM dataset
# Name fragments identifying the columns to drop from `vdem`.
string_vector <- c(
  "_nr",
  "_codehigh", "_codelow",
  "_mean", "_sd",
  "_ord", "_ord_codehigh", "_ord_codelow",
  "_osp", "_osp_codehigh", "_osp_codelow", "_osp_sd"
)

# Join the fragments into a single regular expression of alternatives.
# NOTE(review): the pattern is unanchored, so a fragment matches anywhere in
# a column name, not only at the end -- confirm that this is intended.
pattern <- paste(string_vector, collapse = "|")

# Drop every column whose name matches the pattern, then keep only the
# observations whose year lies between 1900 and 2022.
vdem <- vdem %>%
  dplyr::select(-matches(pattern)) %>%
  filter(year %in% 1900:2022)
v2x_jucon: To what extent does the executive respect the constitution and comply with court rulings, and to what extent is the judiciary able to act in an independent fashion?
v2x_corr: How pervasive is political corruption?
# Average the corruption and judicial-constraint indices per country,
# ignoring missing country-years.
vdem_summarised <- vdem %>%
  group_by(country_name) %>%
  summarise(
    avg_corr = mean(v2x_corr, na.rm = TRUE),
    avg_judic = mean(v2x_jucon, na.rm = TRUE)
  )

# Scatter plot of the country averages. The summarised data must be passed
# to ggplot() explicitly: calling ggplot(aes(...)) on its own hands the
# aesthetic mapping to the `data` argument and fails.
vdem_summarised %>%
  ggplot(aes(x = avg_corr, y = avg_judic)) +
  geom_point(alpha = 0.6) +
  scale_y_continuous(labels = scales::label_comma())

First we need to model the variables in a linear regression
Then we create a residuals variable
And then we can find which residuals are more than two standard deviations (of the residuals) away from the OLS line
# Fit the OLS line of average judicial score on average corruption.
# na.exclude makes resid() return one value per input row (NA for any row
# the model had to drop because of a missing average), so the residuals
# always line up with the data frame when added as a column.
model <- lm(
  avg_judic ~ avg_corr,
  data = vdem_summarised,
  na.action = na.exclude
)

# Store each country's residual next to its averages.
vdem_summarised <- vdem_summarised %>%
  mutate(residuals = resid(model))

# Cut-off for flagging outliers: two standard deviations of the residuals.
# na.rm = TRUE skips the padded NA residuals of dropped rows.
residual_threshold <- 2 * sd(vdem_summarised$residuals, na.rm = TRUE)
Next we flag certain countries as outliers based on the model residuals
# Flag a country as an outlier when the absolute value of its residual
# exceeds the threshold. The comparison already yields TRUE/FALSE (NA when
# the residual is missing), so no ifelse() wrapper is needed.
vdem_summarised <- vdem_summarised %>%
  mutate(outlier = abs(residuals) > residual_threshold)
And we plot it out
# Scatter plot of the country averages with the fitted OLS line. Points are
# coloured by the `outlier` flag and only the flagged countries are labelled.
vdem_summarised %>%
  ggplot(aes(x = avg_corr, y = avg_judic)) +
  geom_smooth(
    color = "#003d5b",
    method = "lm",
    se = FALSE,
    linewidth = 3, # line thickness; `size` is deprecated for lines
    alpha = 0.2
  ) +
  geom_point(
    aes(color = outlier),
    size = 4,
    alpha = 0.6
  ) +
  scale_color_manual(
    values = c("FALSE" = "#00798c", "TRUE" = "#c1121f")
  ) +
  ggrepel::geom_label_repel(
    data = filter(vdem_summarised, outlier), # label the outliers only
    aes(label = country_name),
    size = 3,
    nudge_x = 0.1,
    nudge_y = 0.1,
    color = "#c1121f"
  ) +
  theme_minimal() +
  labs(
    title = "Judicial Independence vs. Political Corruption",
    # The data are filtered to the years 1900 through 2022.
    caption = "V-DEM average 1900 - 2022",
    # Axis titles follow the mapping: x is avg_corr, y is avg_judic.
    x = "Average Political Corruption",
    y = "Average Judicial Independence"
  ) +
  guides(color = guide_legend(override.aes = list(size = 4))) +
  theme(
    text = element_text(size = 12),                      # Default text size
    plot.title = element_text(size = 20, face = "bold"), # Plot title
    axis.title = element_text(size = 16),                # Axis titles (x and y)
    axis.text = element_text(size = 14),                 # Axis text (x and y)
    legend.title = element_text(size = 14),              # Legend title
    legend.text = element_text(size = 12)                # Legend items
  )

First, we will calculate the inter-quartile range and define outliers for our y variable.
# Inter-quartile range of y. na.rm = TRUE is required because IQR() and
# quantile() stop with an error when y contains missing values.
iqr <- IQR(df$y, na.rm = TRUE)

# Outlier fences: 1.5 * IQR above the third quartile and below the first.
# names = FALSE returns plain numbers, so the fences do not carry the
# quantile labels "75%" / "25%", which no longer describe them.
upper_threshold <- quantile(df$y, 0.75, na.rm = TRUE, names = FALSE) + 1.5 * iqr
lower_threshold <- quantile(df$y, 0.25, na.rm = TRUE, names = FALSE) - 1.5 * iqr
And we use those IQR thresholds to add an outlier flag to the df
# Flag rows whose y value falls outside the lower/upper thresholds. The
# comparison already yields TRUE/FALSE (NA when y is missing), so no
# ifelse() wrapper is needed.
df <- df %>%
  mutate(outlier = y < lower_threshold | y > upper_threshold)
And we can graph the ggplot with geom_label_repel() labels only for the outliers.
# Scatter plot of every row of df; only the rows flagged as outliers get a
# repelled country-name label, nudged by 0.25 along the y axis.
df %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = filter(df, outlier),
    mapping = aes(label = country_name),
    nudge_y = 0.25
  ) +
  theme_minimal()

