# ============================================================================= # 01-sampling.R · 2022 cross-section for municipalities CA # ============================================================================= # # Takes the full dataset (1968–2026, 290 municipalities) and produces a single # clean cross-section anchored to 2022. # # Each variable is assigned one of five actions: # # keep complete (or near-complete) in 2022; used as-is. # Variables with a handful of NAs and no backfill data # (e.g. election variables, 3-4 missing per cycle) # are also kept; residual NAs are noted in the audit. # # backfill_window missing in 2022 (fully or partially), but data exist # in [2020, 2021]: fill each municipality from its most # recent non-NA value in that window. # # backfill_census 100% missing in 2022, but the series is still active # (data exists AFTER 2022, i.e. a periodic/census # variable such as EU parliament elections). # Strategy: use the year closest to 2022, looking both # backwards and forwards. # # backfill_disc 100% missing in 2022, and the series was discontinued # before 2022 (data only exists in the past). # Strategy: use the most recent available year for each # municipality, regardless of how long ago it was. # # drop no data at all in the full panel; excluded. # # Outputs: # data/processed/m_sample.rds : 290 municipalities × retained variables # data/processed/sampling_audit.csv : variable-level audit log library(tidyverse) # 00-Load ---------------------------------------------------------------------- municipalities_raw <- read_rds("data/processed/m_raw.rds") # Panel: 17 110 rows (290 municipalities × ~59 years), 257 columns. # 01-Characterise each variable's availability --------------------------------- # 1a. Full dataset: first and last year with any non-NA value. availability_full <- municipalities_raw |> pivot_longer( cols = -c(year, code, municipality), names_to = "variable", values_to = "value" ) |> filter(!is.na(value)) |> summarise( .by = variable, first_year = min(year), last_year = max(year) ) # 1b. Backfill window [2020, 2021]: the most recent year with any non-NA value. # If a variable has nothing here, the window-backfill path is unavailable. availability_window <- municipalities_raw |> filter(between(year, 2020, 2021)) |> pivot_longer( cols = -c(year, code, municipality), names_to = "variable", values_to = "value" ) |> filter(!is.na(value)) |> summarise( .by = variable, window_last_year = max(year) ) # 02-Extract the 2022 slice and assess NAs ------------------------------------- m_2022 <- municipalities_raw |> filter(year == 2022) n_munic <- nrow(m_2022) # 290 na_2022 <- m_2022 |> summarise(across(-c(year, code, municipality), \(x) sum(is.na(x)))) |> pivot_longer(everything(), names_to = "variable", values_to = "n_na") |> mutate(pct_na = n_na / n_munic) # 03-Classify every variable --------------------------------------------------- variable_plan <- na_2022 |> left_join(availability_full, by = "variable") |> left_join(availability_window, by = "variable") |> mutate( action = case_when( # Already fine in 2022 (including partial NAs with no window fill available) n_na == 0 ~ "keep", n_na > 0 & n_na < n_munic & is.na(window_last_year) ~ "keep", # Gap in 2022 but window data available; standard window backfill n_na > 0 & !is.na(window_last_year) ~ "backfill_window", # Fully missing in 2022 series is still active (data after 2022); periodic n_na == n_munic & is.na(window_last_year) & !is.na(last_year) & last_year > 2022 ~ "backfill_census", # Fully missing in 2022; series ended before 2022; discontinued n_na == n_munic & is.na(window_last_year) & !is.na(last_year) & last_year <= 2022 ~ "backfill_disc", # No data anywhere in the dataset is.na(last_year) ~ "drop" ), # Sub-type for reporting fill_type = case_when( action == "backfill_window" & n_na == n_munic ~ "full_column", action == "backfill_window" & n_na < n_munic ~ "partial", action == "keep" & n_na > 0 ~ "residual_na", action == "backfill_census" ~ "census_closest", action == "backfill_disc" ~ "discontinued_last", TRUE ~ NA_character_ ) ) # 04-Backfill A: window [2020, 2021] Per-municipality, most recent non-NA value. vars_window <- variable_plan |> filter(action == "backfill_window") |> pull(variable) window_long <- municipalities_raw |> filter(between(year, 2020, 2021)) |> select(year, code, all_of(vars_window)) |> pivot_longer( all_of(vars_window), names_to = "variable", values_to = "value" ) |> filter(!is.na(value)) |> group_by(code, variable) |> slice_max(year, n = 1, with_ties = FALSE) |> ungroup() window_source_year <- window_long |> group_by(variable) |> summarise(source_year = max(year), .groups = "drop") window_wide <- window_long |> select(-year) |> pivot_wider(names_from = variable, values_from = value) # 05-Backfill B: census / periodic Per-municipality, year closest to 2022 in # either direction. # (Applies to EU parliament election variables: 2024 is 2 years away, # 2019 is 3 years away, so 2024 will be selected for all municipalities.) # ------------------------------------------------------------------------------ vars_census <- variable_plan |> filter(action == "backfill_census") |> pull(variable) census_long <- municipalities_raw |> filter(year != 2022) |> select(year, code, all_of(vars_census)) |> pivot_longer( all_of(vars_census), names_to = "variable", values_to = "value" ) |> filter(!is.na(value)) |> mutate(distance = abs(year - 2022)) |> group_by(code, variable) |> slice_min(distance, n = 1, with_ties = FALSE) |> ungroup() census_source_year <- census_long |> group_by(variable) |> summarise(source_year = max(year), .groups = "drop") census_wide <- census_long |> select(-year, -distance) |> pivot_wider(names_from = variable, values_from = value) # 06-Backfill C: discontinued -------------------------------------------------- vars_disc <- variable_plan |> filter(action == "backfill_disc") |> pull(variable) disc_long <- municipalities_raw |> filter(year < 2022) |> select(year, code, all_of(vars_disc)) |> pivot_longer(all_of(vars_disc), names_to = "variable", values_to = "value") |> filter(!is.na(value)) |> group_by(code, variable) |> slice_max(year, n = 1, with_ties = FALSE) |> ungroup() disc_source_year <- disc_long |> group_by(variable) |> summarise(source_year = max(year), .groups = "drop") disc_wide <- disc_long |> select(-year) |> pivot_wider(names_from = variable, values_from = value) # 07-Apply all fills (window first, then census and disc which only touch the # still-NA cells, i.e. the 100%-missing variables ------------------------------ m_2022_filled <- m_2022 |> rows_patch(window_wide, by = "code", unmatched = "ignore") |> rows_patch(census_wide, by = "code", unmatched = "ignore") |> rows_patch(disc_wide, by = "code", unmatched = "ignore") # 08-Remove truly empty variables and the redundant year column ---------------- vars_drop <- variable_plan |> filter(action == "drop") |> pull(variable) m_sample <- m_2022_filled |> select(-all_of(vars_drop), -year) # Check residual NAs na_remaining <- m_sample |> summarise(across(-c(code, municipality), \(x) sum(is.na(x)))) |> pivot_longer(everything(), names_to = "variable", values_to = "n_na_final") |> filter(n_na_final > 0) # 09-Audit --------------------------------------------------------------------- # Combine source-year info from all three fill paths source_years <- bind_rows( window_source_year, census_source_year, disc_source_year ) sampling_audit <- variable_plan |> left_join(source_years, by = "variable") |> left_join(na_remaining, by = "variable") |> mutate( n_na_final = replace_na(n_na_final, 0L), years_from_2022 = if_else( !is.na(source_year), abs(source_year - 2022L), NA_integer_ ), note = case_when( action == "keep" & is.na(fill_type) ~ "Complete in 2022", fill_type == "residual_na" ~ paste0( n_na, " municipalities (", round(pct_na * 100, 1), "%) have no 2022 value ", "and no [2020, 2021] data (variable measured only in specific years, e.g. elections)" ), action == "drop" ~ "Dropped — no data anywhere in the panel", fill_type == "full_column" ~ paste0( "Entire column backfilled from ", source_year, " (100% missing in 2022)" ), fill_type == "partial" ~ paste0( round(pct_na * 100, 1), "% missing in 2022; ", "per-municipality fill from ≤", source_year, if_else( n_na_final > 0, paste0("; ", n_na_final, " municipalities remain NA"), "; fully resolved" ) ), fill_type == "census_closest" ~ paste0( "Periodic variable — no data in 2022; backfilled from ", source_year, " (", years_from_2022, " years from anchor)" ), fill_type == "discontinued_last" ~ paste0( "Series discontinued; backfilled from most recent available year (", source_year, ", ", years_from_2022, " years from anchor)" ) ) ) |> select( variable, action, fill_type, n_na_2022 = n_na, pct_na_2022 = pct_na, n_na_final, first_year, last_year, source_year, years_from_2022, note ) |> arrange(action, fill_type, desc(years_from_2022)) # 10-Save ---------------------------------------------------------------------- write_rds(m_sample, "data/processed/m_sample.rds") write_csv(sampling_audit, "data/processed/sampling_audit.csv")