ruralitic-qrm/src/municipalities/01-sampling.R

310 lines
10 KiB
R
Raw Normal View History

2026-05-07 15:02:25 +02:00
# =============================================================================
# 01-sampling.R · 2022 cross-section for municipalities CA
# =============================================================================
#
# Takes the full dataset (19682026, 290 municipalities) and produces a single
# clean cross-section anchored to 2022.
#
# Each variable is assigned one of five actions:
#
# keep complete (or near-complete) in 2022; used as-is.
# Variables with a handful of NAs and no backfill data
# (e.g. election variables, 3-4 missing per cycle)
# are also kept; residual NAs are noted in the audit.
#
# backfill_window missing in 2022 (fully or partially), but data exist
# in [2020, 2021]: fill each municipality from its most
# recent non-NA value in that window.
#
# backfill_census 100% missing in 2022, but the series is still active
# (data exists AFTER 2022, i.e. a periodic/census
# variable such as EU parliament elections).
# Strategy: use the year closest to 2022, looking both
# backwards and forwards.
#
# backfill_disc 100% missing in 2022, and the series was discontinued
# before 2022 (data only exists in the past).
# Strategy: use the most recent available year for each
# municipality, regardless of how long ago it was.
#
# drop no data at all in the full panel; excluded.
#
# Outputs:
# data/processed/m_sample.rds : 290 municipalities × retained variables
# data/processed/sampling_audit.csv : variable-level audit log
library(tidyverse)
# 00-Load ----------------------------------------------------------------------
2026-05-07 14:32:28 +02:00
municipalities_raw <- read_rds("data/processed/m_raw.rds")
2026-05-07 15:02:25 +02:00
# Panel: 17 110 rows (290 municipalities × ~59 years), 257 columns.
# 01-Characterise each variable's availability ---------------------------------
# 1a. Full dataset: first and last year with any non-NA value.
availability_full <- municipalities_raw |>
pivot_longer(
cols = -c(year, code, municipality),
names_to = "variable",
values_to = "value"
) |>
filter(!is.na(value)) |>
summarise(
.by = variable,
first_year = min(year),
last_year = max(year)
)
# 1b. Backfill window [2020, 2021]: the most recent year with any non-NA value.
# If a variable has nothing here, the window-backfill path is unavailable.
availability_window <- municipalities_raw |>
filter(between(year, 2020, 2021)) |>
pivot_longer(
cols = -c(year, code, municipality),
names_to = "variable",
values_to = "value"
) |>
filter(!is.na(value)) |>
summarise(
.by = variable,
window_last_year = max(year)
)
# 02-Extract the 2022 slice and assess NAs -------------------------------------
m_2022 <- municipalities_raw |> filter(year == 2022)
n_munic <- nrow(m_2022) # 290
na_2022 <- m_2022 |>
summarise(across(-c(year, code, municipality), \(x) sum(is.na(x)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_na") |>
mutate(pct_na = n_na / n_munic)
# 03-Classify every variable ---------------------------------------------------
variable_plan <- na_2022 |>
left_join(availability_full, by = "variable") |>
left_join(availability_window, by = "variable") |>
mutate(
action = case_when(
# Already fine in 2022 (including partial NAs with no window fill available)
n_na == 0 ~ "keep",
n_na > 0 & n_na < n_munic & is.na(window_last_year) ~ "keep",
# Gap in 2022 but window data available; standard window backfill
n_na > 0 & !is.na(window_last_year) ~ "backfill_window",
# Fully missing in 2022 series is still active (data after 2022); periodic
n_na == n_munic &
is.na(window_last_year) &
!is.na(last_year) &
last_year > 2022 ~ "backfill_census",
# Fully missing in 2022; series ended before 2022; discontinued
n_na == n_munic &
is.na(window_last_year) &
!is.na(last_year) &
last_year <= 2022 ~ "backfill_disc",
# No data anywhere in the dataset
is.na(last_year) ~ "drop"
),
# Sub-type for reporting
fill_type = case_when(
action == "backfill_window" & n_na == n_munic ~ "full_column",
action == "backfill_window" & n_na < n_munic ~ "partial",
action == "keep" & n_na > 0 ~ "residual_na",
action == "backfill_census" ~ "census_closest",
action == "backfill_disc" ~ "discontinued_last",
TRUE ~ NA_character_
)
)
# 04-Backfill A: window [2020, 2021] Per-municipality, most recent non-NA value.
vars_window <- variable_plan |>
filter(action == "backfill_window") |>
pull(variable)
window_long <- municipalities_raw |>
filter(between(year, 2020, 2021)) |>
select(year, code, all_of(vars_window)) |>
pivot_longer(
all_of(vars_window),
names_to = "variable",
values_to = "value"
) |>
filter(!is.na(value)) |>
group_by(code, variable) |>
slice_max(year, n = 1, with_ties = FALSE) |>
ungroup()
window_source_year <- window_long |>
group_by(variable) |>
summarise(source_year = max(year), .groups = "drop")
window_wide <- window_long |>
select(-year) |>
pivot_wider(names_from = variable, values_from = value)
# 05-Backfill B: census / periodic Per-municipality, year closest to 2022 in
# either direction.
# (Applies to EU parliament election variables: 2024 is 2 years away,
# 2019 is 3 years away, so 2024 will be selected for all municipalities.)
# ------------------------------------------------------------------------------
vars_census <- variable_plan |>
filter(action == "backfill_census") |>
pull(variable)
census_long <- municipalities_raw |>
filter(year != 2022) |>
select(year, code, all_of(vars_census)) |>
pivot_longer(
all_of(vars_census),
names_to = "variable",
values_to = "value"
) |>
filter(!is.na(value)) |>
mutate(distance = abs(year - 2022)) |>
group_by(code, variable) |>
slice_min(distance, n = 1, with_ties = FALSE) |>
ungroup()
census_source_year <- census_long |>
group_by(variable) |>
summarise(source_year = max(year), .groups = "drop")
census_wide <- census_long |>
select(-year, -distance) |>
pivot_wider(names_from = variable, values_from = value)
# 06-Backfill C: discontinued --------------------------------------------------
vars_disc <- variable_plan |>
filter(action == "backfill_disc") |>
pull(variable)
disc_long <- municipalities_raw |>
filter(year < 2022) |>
select(year, code, all_of(vars_disc)) |>
pivot_longer(all_of(vars_disc), names_to = "variable", values_to = "value") |>
filter(!is.na(value)) |>
group_by(code, variable) |>
slice_max(year, n = 1, with_ties = FALSE) |>
ungroup()
disc_source_year <- disc_long |>
group_by(variable) |>
summarise(source_year = max(year), .groups = "drop")
disc_wide <- disc_long |>
select(-year) |>
pivot_wider(names_from = variable, values_from = value)
# 07-Apply all fills (window first, then census and disc which only touch the
# still-NA cells, i.e. the 100%-missing variables ------------------------------
m_2022_filled <- m_2022 |>
rows_patch(window_wide, by = "code", unmatched = "ignore") |>
rows_patch(census_wide, by = "code", unmatched = "ignore") |>
rows_patch(disc_wide, by = "code", unmatched = "ignore")
# 08-Remove truly empty variables and the redundant year column ----------------
vars_drop <- variable_plan |> filter(action == "drop") |> pull(variable)
m_sample <- m_2022_filled |>
select(-all_of(vars_drop), -year)
# Check residual NAs
na_remaining <- m_sample |>
summarise(across(-c(code, municipality), \(x) sum(is.na(x)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_na_final") |>
filter(n_na_final > 0)
# 09-Audit ---------------------------------------------------------------------
# Combine source-year info from all three fill paths
source_years <- bind_rows(
window_source_year,
census_source_year,
disc_source_year
)
sampling_audit <- variable_plan |>
left_join(source_years, by = "variable") |>
left_join(na_remaining, by = "variable") |>
mutate(
n_na_final = replace_na(n_na_final, 0L),
years_from_2022 = if_else(
!is.na(source_year),
abs(source_year - 2022L),
NA_integer_
),
note = case_when(
action == "keep" & is.na(fill_type) ~
"Complete in 2022",
fill_type == "residual_na" ~
paste0(
n_na,
" municipalities (",
round(pct_na * 100, 1),
"%) have no 2022 value ",
"and no [2020, 2021] data (variable measured only in specific years, e.g. elections)"
),
action == "drop" ~
"Dropped — no data anywhere in the panel",
fill_type == "full_column" ~
paste0(
"Entire column backfilled from ",
source_year,
" (100% missing in 2022)"
),
fill_type == "partial" ~
paste0(
round(pct_na * 100, 1),
"% missing in 2022; ",
"per-municipality fill from ≤",
source_year,
if_else(
n_na_final > 0,
paste0("; ", n_na_final, " municipalities remain NA"),
"; fully resolved"
)
),
fill_type == "census_closest" ~
paste0(
"Periodic variable — no data in 2022; backfilled from ",
source_year,
" (",
years_from_2022,
" years from anchor)"
),
fill_type == "discontinued_last" ~
paste0(
"Series discontinued; backfilled from most recent available year (",
source_year,
", ",
years_from_2022,
" years from anchor)"
)
)
) |>
select(
variable,
action,
fill_type,
n_na_2022 = n_na,
pct_na_2022 = pct_na,
n_na_final,
first_year,
last_year,
source_year,
years_from_2022,
note
) |>
arrange(action, fill_type, desc(years_from_2022))
# 10-Save ----------------------------------------------------------------------
write_rds(m_sample, "data/processed/m_sample.rds")
write_csv(sampling_audit, "data/processed/sampling_audit.csv")