ruralitic-qrm/src/municipalities/01-sampling.R
2026-05-07 15:02:25 +02:00

309 lines
10 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# =============================================================================
# 01-sampling.R · 2022 cross-section for municipalities CA
# =============================================================================
#
# Takes the full dataset (19682026, 290 municipalities) and produces a single
# clean cross-section anchored to 2022.
#
# Each variable is assigned one of five actions:
#
# keep complete (or near-complete) in 2022; used as-is.
# Variables with a handful of NAs and no backfill data
# (e.g. election variables, 3-4 missing per cycle)
# are also kept; residual NAs are noted in the audit.
#
# backfill_window missing in 2022 (fully or partially), but data exist
# in [2020, 2021]: fill each municipality from its most
# recent non-NA value in that window.
#
# backfill_census 100% missing in 2022, but the series is still active
# (data exists AFTER 2022, i.e. a periodic/census
# variable such as EU parliament elections).
# Strategy: use the year closest to 2022, looking both
# backwards and forwards.
#
# backfill_disc 100% missing in 2022, and the series was discontinued
# before 2022 (data only exists in the past).
# Strategy: use the most recent available year for each
# municipality, regardless of how long ago it was.
#
# drop no data at all in the full panel; excluded.
#
# Outputs:
# data/processed/m_sample.rds : 290 municipalities × retained variables
# data/processed/sampling_audit.csv : variable-level audit log
library(tidyverse)
# 00-Load ----------------------------------------------------------------------
municipalities_raw <- read_rds("data/processed/m_raw.rds")
# Panel: 17 110 rows (290 municipalities × ~59 years), 257 columns.
# 01-Characterise each variable's availability ---------------------------------
# 1a. Full dataset: first and last year with any non-NA value.
availability_full <- municipalities_raw |>
pivot_longer(
cols = -c(year, code, municipality),
names_to = "variable",
values_to = "value"
) |>
filter(!is.na(value)) |>
summarise(
.by = variable,
first_year = min(year),
last_year = max(year)
)
# 1b. Backfill window [2020, 2021]: the most recent year with any non-NA value.
# If a variable has nothing here, the window-backfill path is unavailable.
availability_window <- municipalities_raw |>
filter(between(year, 2020, 2021)) |>
pivot_longer(
cols = -c(year, code, municipality),
names_to = "variable",
values_to = "value"
) |>
filter(!is.na(value)) |>
summarise(
.by = variable,
window_last_year = max(year)
)
# 02-Extract the 2022 slice and assess NAs -------------------------------------
m_2022 <- municipalities_raw |> filter(year == 2022)
n_munic <- nrow(m_2022) # 290
na_2022 <- m_2022 |>
summarise(across(-c(year, code, municipality), \(x) sum(is.na(x)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_na") |>
mutate(pct_na = n_na / n_munic)
# 03-Classify every variable ---------------------------------------------------
variable_plan <- na_2022 |>
left_join(availability_full, by = "variable") |>
left_join(availability_window, by = "variable") |>
mutate(
action = case_when(
# Already fine in 2022 (including partial NAs with no window fill available)
n_na == 0 ~ "keep",
n_na > 0 & n_na < n_munic & is.na(window_last_year) ~ "keep",
# Gap in 2022 but window data available; standard window backfill
n_na > 0 & !is.na(window_last_year) ~ "backfill_window",
# Fully missing in 2022 series is still active (data after 2022); periodic
n_na == n_munic &
is.na(window_last_year) &
!is.na(last_year) &
last_year > 2022 ~ "backfill_census",
# Fully missing in 2022; series ended before 2022; discontinued
n_na == n_munic &
is.na(window_last_year) &
!is.na(last_year) &
last_year <= 2022 ~ "backfill_disc",
# No data anywhere in the dataset
is.na(last_year) ~ "drop"
),
# Sub-type for reporting
fill_type = case_when(
action == "backfill_window" & n_na == n_munic ~ "full_column",
action == "backfill_window" & n_na < n_munic ~ "partial",
action == "keep" & n_na > 0 ~ "residual_na",
action == "backfill_census" ~ "census_closest",
action == "backfill_disc" ~ "discontinued_last",
TRUE ~ NA_character_
)
)
# 04-Backfill A: window [2020, 2021] Per-municipality, most recent non-NA value.
vars_window <- variable_plan |>
filter(action == "backfill_window") |>
pull(variable)
window_long <- municipalities_raw |>
filter(between(year, 2020, 2021)) |>
select(year, code, all_of(vars_window)) |>
pivot_longer(
all_of(vars_window),
names_to = "variable",
values_to = "value"
) |>
filter(!is.na(value)) |>
group_by(code, variable) |>
slice_max(year, n = 1, with_ties = FALSE) |>
ungroup()
window_source_year <- window_long |>
group_by(variable) |>
summarise(source_year = max(year), .groups = "drop")
window_wide <- window_long |>
select(-year) |>
pivot_wider(names_from = variable, values_from = value)
# 05-Backfill B: census / periodic Per-municipality, year closest to 2022 in
# either direction.
# (Applies to EU parliament election variables: 2024 is 2 years away,
# 2019 is 3 years away, so 2024 will be selected for all municipalities.)
# ------------------------------------------------------------------------------
vars_census <- variable_plan |>
filter(action == "backfill_census") |>
pull(variable)
census_long <- municipalities_raw |>
filter(year != 2022) |>
select(year, code, all_of(vars_census)) |>
pivot_longer(
all_of(vars_census),
names_to = "variable",
values_to = "value"
) |>
filter(!is.na(value)) |>
mutate(distance = abs(year - 2022)) |>
group_by(code, variable) |>
slice_min(distance, n = 1, with_ties = FALSE) |>
ungroup()
census_source_year <- census_long |>
group_by(variable) |>
summarise(source_year = max(year), .groups = "drop")
census_wide <- census_long |>
select(-year, -distance) |>
pivot_wider(names_from = variable, values_from = value)
# 06-Backfill C: discontinued --------------------------------------------------
vars_disc <- variable_plan |>
filter(action == "backfill_disc") |>
pull(variable)
disc_long <- municipalities_raw |>
filter(year < 2022) |>
select(year, code, all_of(vars_disc)) |>
pivot_longer(all_of(vars_disc), names_to = "variable", values_to = "value") |>
filter(!is.na(value)) |>
group_by(code, variable) |>
slice_max(year, n = 1, with_ties = FALSE) |>
ungroup()
disc_source_year <- disc_long |>
group_by(variable) |>
summarise(source_year = max(year), .groups = "drop")
disc_wide <- disc_long |>
select(-year) |>
pivot_wider(names_from = variable, values_from = value)
# 07-Apply all fills (window first, then census and disc which only touch the
# still-NA cells, i.e. the 100%-missing variables ------------------------------
m_2022_filled <- m_2022 |>
rows_patch(window_wide, by = "code", unmatched = "ignore") |>
rows_patch(census_wide, by = "code", unmatched = "ignore") |>
rows_patch(disc_wide, by = "code", unmatched = "ignore")
# 08-Remove truly empty variables and the redundant year column ----------------
vars_drop <- variable_plan |> filter(action == "drop") |> pull(variable)
m_sample <- m_2022_filled |>
select(-all_of(vars_drop), -year)
# Check residual NAs
na_remaining <- m_sample |>
summarise(across(-c(code, municipality), \(x) sum(is.na(x)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_na_final") |>
filter(n_na_final > 0)
# 09-Audit ---------------------------------------------------------------------
# Combine source-year info from all three fill paths
source_years <- bind_rows(
window_source_year,
census_source_year,
disc_source_year
)
sampling_audit <- variable_plan |>
left_join(source_years, by = "variable") |>
left_join(na_remaining, by = "variable") |>
mutate(
n_na_final = replace_na(n_na_final, 0L),
years_from_2022 = if_else(
!is.na(source_year),
abs(source_year - 2022L),
NA_integer_
),
note = case_when(
action == "keep" & is.na(fill_type) ~
"Complete in 2022",
fill_type == "residual_na" ~
paste0(
n_na,
" municipalities (",
round(pct_na * 100, 1),
"%) have no 2022 value ",
"and no [2020, 2021] data (variable measured only in specific years, e.g. elections)"
),
action == "drop" ~
"Dropped — no data anywhere in the panel",
fill_type == "full_column" ~
paste0(
"Entire column backfilled from ",
source_year,
" (100% missing in 2022)"
),
fill_type == "partial" ~
paste0(
round(pct_na * 100, 1),
"% missing in 2022; ",
"per-municipality fill from ≤",
source_year,
if_else(
n_na_final > 0,
paste0("; ", n_na_final, " municipalities remain NA"),
"; fully resolved"
)
),
fill_type == "census_closest" ~
paste0(
"Periodic variable — no data in 2022; backfilled from ",
source_year,
" (",
years_from_2022,
" years from anchor)"
),
fill_type == "discontinued_last" ~
paste0(
"Series discontinued; backfilled from most recent available year (",
source_year,
", ",
years_from_2022,
" years from anchor)"
)
)
) |>
select(
variable,
action,
fill_type,
n_na_2022 = n_na,
pct_na_2022 = pct_na,
n_na_final,
first_year,
last_year,
source_year,
years_from_2022,
note
) |>
arrange(action, fill_type, desc(years_from_2022))
# 10-Save ----------------------------------------------------------------------
write_rds(m_sample, "data/processed/m_sample.rds")
write_csv(sampling_audit, "data/processed/sampling_audit.csv")