2026-05-07 15:02:25 +02:00
|
|
|
|
# =============================================================================
|
|
|
|
|
|
# 01-sampling.R · 2022 cross-section for municipalities CA
|
|
|
|
|
|
# =============================================================================
|
|
|
|
|
|
#
|
|
|
|
|
|
# Takes the full dataset (1968–2026, 290 municipalities) and produces a single
|
|
|
|
|
|
# clean cross-section anchored to 2022.
|
|
|
|
|
|
#
|
|
|
|
|
|
# Each variable is assigned one of five actions:
|
|
|
|
|
|
#
|
|
|
|
|
|
# keep complete (or near-complete) in 2022; used as-is.
|
|
|
|
|
|
# Variables with a handful of NAs and no backfill data
|
|
|
|
|
|
# (e.g. election variables, 3-4 missing per cycle)
|
|
|
|
|
|
# are also kept; residual NAs are noted in the audit.
|
|
|
|
|
|
#
|
|
|
|
|
|
# backfill_window missing in 2022 (fully or partially), but data exist
|
|
|
|
|
|
# in [2020, 2021]: fill each municipality from its most
|
|
|
|
|
|
# recent non-NA value in that window.
|
|
|
|
|
|
#
|
|
|
|
|
|
# backfill_census 100% missing in 2022, but the series is still active
|
|
|
|
|
|
# (data exists AFTER 2022, i.e. a periodic/census
|
|
|
|
|
|
# variable such as EU parliament elections).
|
|
|
|
|
|
# Strategy: use the year closest to 2022, looking both
|
|
|
|
|
|
# backwards and forwards.
|
|
|
|
|
|
#
|
|
|
|
|
|
# backfill_disc 100% missing in 2022, and the series was discontinued
|
|
|
|
|
|
# before 2022 (data only exists in the past).
|
|
|
|
|
|
# Strategy: use the most recent available year for each
|
|
|
|
|
|
# municipality, regardless of how long ago it was.
|
|
|
|
|
|
#
|
|
|
|
|
|
# drop no data at all in the full panel; excluded.
|
|
|
|
|
|
#
|
|
|
|
|
|
# Outputs:
|
|
|
|
|
|
# data/processed/m_sample.rds : 290 municipalities × retained variables
|
|
|
|
|
|
# data/processed/sampling_audit.csv : variable-level audit log
|
|
|
|
|
|
|
|
|
|
|
|
library(tidyverse)
|
|
|
|
|
|
|
|
|
|
|
|
# 00-Load ----------------------------------------------------------------------
|
2026-05-07 14:32:28 +02:00
|
|
|
|
municipalities_raw <- read_rds("data/processed/m_raw.rds")
|
2026-05-07 15:02:25 +02:00
|
|
|
|
# Panel: 17 110 rows (290 municipalities × ~59 years), 257 columns.
|
|
|
|
|
|
|
|
|
|
|
|
# 01-Characterise each variable's availability ---------------------------------
|
|
|
|
|
|
# 1a. Full dataset: first and last year with any non-NA value.
|
|
|
|
|
|
availability_full <- municipalities_raw |>
|
|
|
|
|
|
pivot_longer(
|
|
|
|
|
|
cols = -c(year, code, municipality),
|
|
|
|
|
|
names_to = "variable",
|
|
|
|
|
|
values_to = "value"
|
|
|
|
|
|
) |>
|
|
|
|
|
|
filter(!is.na(value)) |>
|
|
|
|
|
|
summarise(
|
|
|
|
|
|
.by = variable,
|
|
|
|
|
|
first_year = min(year),
|
|
|
|
|
|
last_year = max(year)
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# 1b. Backfill window [2020, 2021]: the most recent year with any non-NA value.
|
|
|
|
|
|
# If a variable has nothing here, the window-backfill path is unavailable.
|
|
|
|
|
|
availability_window <- municipalities_raw |>
|
|
|
|
|
|
filter(between(year, 2020, 2021)) |>
|
|
|
|
|
|
pivot_longer(
|
|
|
|
|
|
cols = -c(year, code, municipality),
|
|
|
|
|
|
names_to = "variable",
|
|
|
|
|
|
values_to = "value"
|
|
|
|
|
|
) |>
|
|
|
|
|
|
filter(!is.na(value)) |>
|
|
|
|
|
|
summarise(
|
|
|
|
|
|
.by = variable,
|
|
|
|
|
|
window_last_year = max(year)
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# 02-Extract the 2022 slice and assess NAs -------------------------------------
|
|
|
|
|
|
m_2022 <- municipalities_raw |> filter(year == 2022)
|
|
|
|
|
|
n_munic <- nrow(m_2022) # 290
|
|
|
|
|
|
|
|
|
|
|
|
na_2022 <- m_2022 |>
|
|
|
|
|
|
summarise(across(-c(year, code, municipality), \(x) sum(is.na(x)))) |>
|
|
|
|
|
|
pivot_longer(everything(), names_to = "variable", values_to = "n_na") |>
|
|
|
|
|
|
mutate(pct_na = n_na / n_munic)
|
|
|
|
|
|
|
|
|
|
|
|
# 03-Classify every variable ---------------------------------------------------
|
|
|
|
|
|
variable_plan <- na_2022 |>
|
|
|
|
|
|
left_join(availability_full, by = "variable") |>
|
|
|
|
|
|
left_join(availability_window, by = "variable") |>
|
|
|
|
|
|
mutate(
|
|
|
|
|
|
action = case_when(
|
|
|
|
|
|
# Already fine in 2022 (including partial NAs with no window fill available)
|
|
|
|
|
|
n_na == 0 ~ "keep",
|
|
|
|
|
|
n_na > 0 & n_na < n_munic & is.na(window_last_year) ~ "keep",
|
|
|
|
|
|
|
|
|
|
|
|
# Gap in 2022 but window data available; standard window backfill
|
|
|
|
|
|
n_na > 0 & !is.na(window_last_year) ~ "backfill_window",
|
|
|
|
|
|
|
|
|
|
|
|
# Fully missing in 2022 series is still active (data after 2022); periodic
|
|
|
|
|
|
n_na == n_munic &
|
|
|
|
|
|
is.na(window_last_year) &
|
|
|
|
|
|
!is.na(last_year) &
|
|
|
|
|
|
last_year > 2022 ~ "backfill_census",
|
|
|
|
|
|
|
|
|
|
|
|
# Fully missing in 2022; series ended before 2022; discontinued
|
|
|
|
|
|
n_na == n_munic &
|
|
|
|
|
|
is.na(window_last_year) &
|
|
|
|
|
|
!is.na(last_year) &
|
|
|
|
|
|
last_year <= 2022 ~ "backfill_disc",
|
|
|
|
|
|
|
|
|
|
|
|
# No data anywhere in the dataset
|
|
|
|
|
|
is.na(last_year) ~ "drop"
|
|
|
|
|
|
),
|
|
|
|
|
|
# Sub-type for reporting
|
|
|
|
|
|
fill_type = case_when(
|
|
|
|
|
|
action == "backfill_window" & n_na == n_munic ~ "full_column",
|
|
|
|
|
|
action == "backfill_window" & n_na < n_munic ~ "partial",
|
|
|
|
|
|
action == "keep" & n_na > 0 ~ "residual_na",
|
|
|
|
|
|
action == "backfill_census" ~ "census_closest",
|
|
|
|
|
|
action == "backfill_disc" ~ "discontinued_last",
|
|
|
|
|
|
TRUE ~ NA_character_
|
|
|
|
|
|
)
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# 04-Backfill A: window [2020, 2021] Per-municipality, most recent non-NA value.
|
|
|
|
|
|
vars_window <- variable_plan |>
|
|
|
|
|
|
filter(action == "backfill_window") |>
|
|
|
|
|
|
pull(variable)
|
|
|
|
|
|
|
|
|
|
|
|
window_long <- municipalities_raw |>
|
|
|
|
|
|
filter(between(year, 2020, 2021)) |>
|
|
|
|
|
|
select(year, code, all_of(vars_window)) |>
|
|
|
|
|
|
pivot_longer(
|
|
|
|
|
|
all_of(vars_window),
|
|
|
|
|
|
names_to = "variable",
|
|
|
|
|
|
values_to = "value"
|
|
|
|
|
|
) |>
|
|
|
|
|
|
filter(!is.na(value)) |>
|
|
|
|
|
|
group_by(code, variable) |>
|
|
|
|
|
|
slice_max(year, n = 1, with_ties = FALSE) |>
|
|
|
|
|
|
ungroup()
|
|
|
|
|
|
|
|
|
|
|
|
window_source_year <- window_long |>
|
|
|
|
|
|
group_by(variable) |>
|
|
|
|
|
|
summarise(source_year = max(year), .groups = "drop")
|
|
|
|
|
|
|
|
|
|
|
|
window_wide <- window_long |>
|
|
|
|
|
|
select(-year) |>
|
|
|
|
|
|
pivot_wider(names_from = variable, values_from = value)
|
|
|
|
|
|
|
|
|
|
|
|
# 05-Backfill B: census / periodic Per-municipality, year closest to 2022 in
|
|
|
|
|
|
# either direction.
|
|
|
|
|
|
# (Applies to EU parliament election variables: 2024 is 2 years away,
|
|
|
|
|
|
# 2019 is 3 years away, so 2024 will be selected for all municipalities.)
|
|
|
|
|
|
# ------------------------------------------------------------------------------
|
|
|
|
|
|
vars_census <- variable_plan |>
|
|
|
|
|
|
filter(action == "backfill_census") |>
|
|
|
|
|
|
pull(variable)
|
|
|
|
|
|
|
|
|
|
|
|
census_long <- municipalities_raw |>
|
|
|
|
|
|
filter(year != 2022) |>
|
|
|
|
|
|
select(year, code, all_of(vars_census)) |>
|
|
|
|
|
|
pivot_longer(
|
|
|
|
|
|
all_of(vars_census),
|
|
|
|
|
|
names_to = "variable",
|
|
|
|
|
|
values_to = "value"
|
|
|
|
|
|
) |>
|
|
|
|
|
|
filter(!is.na(value)) |>
|
|
|
|
|
|
mutate(distance = abs(year - 2022)) |>
|
|
|
|
|
|
group_by(code, variable) |>
|
|
|
|
|
|
slice_min(distance, n = 1, with_ties = FALSE) |>
|
|
|
|
|
|
ungroup()
|
|
|
|
|
|
|
|
|
|
|
|
census_source_year <- census_long |>
|
|
|
|
|
|
group_by(variable) |>
|
|
|
|
|
|
summarise(source_year = max(year), .groups = "drop")
|
|
|
|
|
|
|
|
|
|
|
|
census_wide <- census_long |>
|
|
|
|
|
|
select(-year, -distance) |>
|
|
|
|
|
|
pivot_wider(names_from = variable, values_from = value)
|
|
|
|
|
|
|
|
|
|
|
|
# 06-Backfill C: discontinued --------------------------------------------------
|
|
|
|
|
|
vars_disc <- variable_plan |>
|
|
|
|
|
|
filter(action == "backfill_disc") |>
|
|
|
|
|
|
pull(variable)
|
|
|
|
|
|
|
|
|
|
|
|
disc_long <- municipalities_raw |>
|
|
|
|
|
|
filter(year < 2022) |>
|
|
|
|
|
|
select(year, code, all_of(vars_disc)) |>
|
|
|
|
|
|
pivot_longer(all_of(vars_disc), names_to = "variable", values_to = "value") |>
|
|
|
|
|
|
filter(!is.na(value)) |>
|
|
|
|
|
|
group_by(code, variable) |>
|
|
|
|
|
|
slice_max(year, n = 1, with_ties = FALSE) |>
|
|
|
|
|
|
ungroup()
|
|
|
|
|
|
|
|
|
|
|
|
disc_source_year <- disc_long |>
|
|
|
|
|
|
group_by(variable) |>
|
|
|
|
|
|
summarise(source_year = max(year), .groups = "drop")
|
|
|
|
|
|
|
|
|
|
|
|
disc_wide <- disc_long |>
|
|
|
|
|
|
select(-year) |>
|
|
|
|
|
|
pivot_wider(names_from = variable, values_from = value)
|
|
|
|
|
|
|
|
|
|
|
|
# 07-Apply all fills (window first, then census and disc which only touch the
|
|
|
|
|
|
# still-NA cells, i.e. the 100%-missing variables ------------------------------
|
|
|
|
|
|
m_2022_filled <- m_2022 |>
|
|
|
|
|
|
rows_patch(window_wide, by = "code", unmatched = "ignore") |>
|
|
|
|
|
|
rows_patch(census_wide, by = "code", unmatched = "ignore") |>
|
|
|
|
|
|
rows_patch(disc_wide, by = "code", unmatched = "ignore")
|
|
|
|
|
|
|
|
|
|
|
|
# 08-Remove truly empty variables and the redundant year column ----------------
|
|
|
|
|
|
vars_drop <- variable_plan |> filter(action == "drop") |> pull(variable)
|
|
|
|
|
|
|
|
|
|
|
|
m_sample <- m_2022_filled |>
|
|
|
|
|
|
select(-all_of(vars_drop), -year)
|
|
|
|
|
|
|
|
|
|
|
|
# Check residual NAs
|
|
|
|
|
|
na_remaining <- m_sample |>
|
|
|
|
|
|
summarise(across(-c(code, municipality), \(x) sum(is.na(x)))) |>
|
|
|
|
|
|
pivot_longer(everything(), names_to = "variable", values_to = "n_na_final") |>
|
|
|
|
|
|
filter(n_na_final > 0)
|
|
|
|
|
|
|
|
|
|
|
|
# 09-Audit ---------------------------------------------------------------------
|
|
|
|
|
|
# Combine source-year info from all three fill paths
|
|
|
|
|
|
source_years <- bind_rows(
|
|
|
|
|
|
window_source_year,
|
|
|
|
|
|
census_source_year,
|
|
|
|
|
|
disc_source_year
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
sampling_audit <- variable_plan |>
|
|
|
|
|
|
left_join(source_years, by = "variable") |>
|
|
|
|
|
|
left_join(na_remaining, by = "variable") |>
|
|
|
|
|
|
mutate(
|
|
|
|
|
|
n_na_final = replace_na(n_na_final, 0L),
|
|
|
|
|
|
years_from_2022 = if_else(
|
|
|
|
|
|
!is.na(source_year),
|
|
|
|
|
|
abs(source_year - 2022L),
|
|
|
|
|
|
NA_integer_
|
|
|
|
|
|
),
|
|
|
|
|
|
note = case_when(
|
|
|
|
|
|
action == "keep" & is.na(fill_type) ~
|
|
|
|
|
|
"Complete in 2022",
|
|
|
|
|
|
|
|
|
|
|
|
fill_type == "residual_na" ~
|
|
|
|
|
|
paste0(
|
|
|
|
|
|
n_na,
|
|
|
|
|
|
" municipalities (",
|
|
|
|
|
|
round(pct_na * 100, 1),
|
|
|
|
|
|
"%) have no 2022 value ",
|
|
|
|
|
|
"and no [2020, 2021] data (variable measured only in specific years, e.g. elections)"
|
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
|
|
action == "drop" ~
|
|
|
|
|
|
"Dropped — no data anywhere in the panel",
|
|
|
|
|
|
|
|
|
|
|
|
fill_type == "full_column" ~
|
|
|
|
|
|
paste0(
|
|
|
|
|
|
"Entire column backfilled from ",
|
|
|
|
|
|
source_year,
|
|
|
|
|
|
" (100% missing in 2022)"
|
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
|
|
fill_type == "partial" ~
|
|
|
|
|
|
paste0(
|
|
|
|
|
|
round(pct_na * 100, 1),
|
|
|
|
|
|
"% missing in 2022; ",
|
|
|
|
|
|
"per-municipality fill from ≤",
|
|
|
|
|
|
source_year,
|
|
|
|
|
|
if_else(
|
|
|
|
|
|
n_na_final > 0,
|
|
|
|
|
|
paste0("; ", n_na_final, " municipalities remain NA"),
|
|
|
|
|
|
"; fully resolved"
|
|
|
|
|
|
)
|
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
|
|
fill_type == "census_closest" ~
|
|
|
|
|
|
paste0(
|
|
|
|
|
|
"Periodic variable — no data in 2022; backfilled from ",
|
|
|
|
|
|
source_year,
|
|
|
|
|
|
" (",
|
|
|
|
|
|
years_from_2022,
|
|
|
|
|
|
" years from anchor)"
|
|
|
|
|
|
),
|
|
|
|
|
|
|
|
|
|
|
|
fill_type == "discontinued_last" ~
|
|
|
|
|
|
paste0(
|
|
|
|
|
|
"Series discontinued; backfilled from most recent available year (",
|
|
|
|
|
|
source_year,
|
|
|
|
|
|
", ",
|
|
|
|
|
|
years_from_2022,
|
|
|
|
|
|
" years from anchor)"
|
|
|
|
|
|
)
|
|
|
|
|
|
)
|
|
|
|
|
|
) |>
|
|
|
|
|
|
select(
|
|
|
|
|
|
variable,
|
|
|
|
|
|
action,
|
|
|
|
|
|
fill_type,
|
|
|
|
|
|
n_na_2022 = n_na,
|
|
|
|
|
|
pct_na_2022 = pct_na,
|
|
|
|
|
|
n_na_final,
|
|
|
|
|
|
first_year,
|
|
|
|
|
|
last_year,
|
|
|
|
|
|
source_year,
|
|
|
|
|
|
years_from_2022,
|
|
|
|
|
|
note
|
|
|
|
|
|
) |>
|
|
|
|
|
|
arrange(action, fill_type, desc(years_from_2022))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 10-Save ----------------------------------------------------------------------
|
|
|
|
|
|
write_rds(m_sample, "data/processed/m_sample.rds")
|
|
|
|
|
|
write_csv(sampling_audit, "data/processed/sampling_audit.csv")
|