ruralitic-qrm/src/municipalities/01-sampling.R

# =============================================================================
#  01-sampling.R  ·  2022 cross-section for municipalities CA
# =============================================================================
#
#  Takes the full dataset (1968–2026, 290 municipalities) and produces a single
#  clean cross-section anchored to 2022.
#
#  Each variable is assigned one of five actions:
#
#    keep                 complete (or near-complete) in 2022; used as-is.
#                         Variables with a handful of NAs and no backfill data
#                         (e.g. election variables, 3-4 missing per cycle)
#                         are also kept; residual NAs are noted in the audit.
#
#    backfill_window      missing in 2022 (fully or partially), but data exist
#                         in [2020, 2021]: fill each municipality from its most
#                         recent non-NA value in that window.
#
#    backfill_census      100% missing in 2022, but the series is still active
#                         (data exists AFTER 2022, i.e. a periodic/census
#                         variable such as EU parliament elections).
#                         Strategy: use the year closest to 2022, looking both
#                         backwards and forwards.
#
#    backfill_disc        100% missing in 2022, and the series was discontinued
#                         before 2022 (data only exists in the past).
#                         Strategy: use the most recent available year for each
#                         municipality, regardless of how long ago it was.
#
#    drop                 no data at all in the full panel; excluded.
#
#  Outputs:
#    data/processed/m_sample.rds       : 290 municipalities × retained variables
#    data/processed/sampling_audit.csv : variable-level audit log

library(tidyverse)

# 00-Load ----------------------------------------------------------------------
municipalities_raw <- read_rds("data/processed/m_raw.rds")
# Panel: 17 110 rows (290 municipalities × ~59 years), 257 columns.

# 01-Characterise each variable's availability ---------------------------------
# 1a. Full dataset: first and last year with any non-NA value.
availability_full <- municipalities_raw |>
  pivot_longer(
    cols = -c(year, code, municipality),
    names_to = "variable",
    values_to = "value"
  ) |>
  filter(!is.na(value)) |>
  summarise(
    .by = variable,
    first_year = min(year),
    last_year = max(year)
  )

# 1b. Backfill window [2020, 2021]: the most recent year with any non-NA value.
#     If a variable has nothing here, the window-backfill path is unavailable.
availability_window <- municipalities_raw |>
  filter(between(year, 2020, 2021)) |>
  pivot_longer(
    cols = -c(year, code, municipality),
    names_to = "variable",
    values_to = "value"
  ) |>
  filter(!is.na(value)) |>
  summarise(
    .by = variable,
    window_last_year = max(year)
  )

# 02-Extract the 2022 slice and assess NAs -------------------------------------
m_2022 <- municipalities_raw |> filter(year == 2022)
n_munic <- nrow(m_2022) # 290

na_2022 <- m_2022 |>
  summarise(across(-c(year, code, municipality), \(x) sum(is.na(x)))) |>
  pivot_longer(everything(), names_to = "variable", values_to = "n_na") |>
  mutate(pct_na = n_na / n_munic)

# 03-Classify every variable ---------------------------------------------------
variable_plan <- na_2022 |>
  left_join(availability_full, by = "variable") |>
  left_join(availability_window, by = "variable") |>
  mutate(
    action = case_when(
      # Already fine in 2022 (including partial NAs with no window fill available)
      n_na == 0 ~ "keep",
      n_na > 0 & n_na < n_munic & is.na(window_last_year) ~ "keep",

      # Gap in 2022 but window data available; standard window backfill
      n_na > 0 & !is.na(window_last_year) ~ "backfill_window",

      # Fully missing in 2022 series is still active (data after 2022); periodic
      n_na == n_munic &
        is.na(window_last_year) &
        !is.na(last_year) &
        last_year > 2022 ~ "backfill_census",

      # Fully missing in 2022; series ended before 2022; discontinued
      n_na == n_munic &
        is.na(window_last_year) &
        !is.na(last_year) &
        last_year <= 2022 ~ "backfill_disc",

      # No data anywhere in the dataset
      is.na(last_year) ~ "drop"
    ),
    # Sub-type for reporting
    fill_type = case_when(
      action == "backfill_window" & n_na == n_munic ~ "full_column",
      action == "backfill_window" & n_na < n_munic ~ "partial",
      action == "keep" & n_na > 0 ~ "residual_na",
      action == "backfill_census" ~ "census_closest",
      action == "backfill_disc" ~ "discontinued_last",
      TRUE ~ NA_character_
    )
  )

# 04-Backfill A: window [2020, 2021] Per-municipality, most recent non-NA value.
vars_window <- variable_plan |>
  filter(action == "backfill_window") |>
  pull(variable)

window_long <- municipalities_raw |>
  filter(between(year, 2020, 2021)) |>
  select(year, code, all_of(vars_window)) |>
  pivot_longer(
    all_of(vars_window),
    names_to = "variable",
    values_to = "value"
  ) |>
  filter(!is.na(value)) |>
  group_by(code, variable) |>
  slice_max(year, n = 1, with_ties = FALSE) |>
  ungroup()

window_source_year <- window_long |>
  group_by(variable) |>
  summarise(source_year = max(year), .groups = "drop")

window_wide <- window_long |>
  select(-year) |>
  pivot_wider(names_from = variable, values_from = value)

# 05-Backfill B: census / periodic Per-municipality, year closest to 2022 in
# either direction.
# (Applies to EU parliament election variables: 2024 is 2 years away,
# 2019 is 3 years away, so 2024 will be selected for all municipalities.)
# ------------------------------------------------------------------------------
vars_census <- variable_plan |>
  filter(action == "backfill_census") |>
  pull(variable)

census_long <- municipalities_raw |>
  filter(year != 2022) |>
  select(year, code, all_of(vars_census)) |>
  pivot_longer(
    all_of(vars_census),
    names_to = "variable",
    values_to = "value"
  ) |>
  filter(!is.na(value)) |>
  mutate(distance = abs(year - 2022)) |>
  group_by(code, variable) |>
  slice_min(distance, n = 1, with_ties = FALSE) |>
  ungroup()

census_source_year <- census_long |>
  group_by(variable) |>
  summarise(source_year = max(year), .groups = "drop")

census_wide <- census_long |>
  select(-year, -distance) |>
  pivot_wider(names_from = variable, values_from = value)

# 06-Backfill C: discontinued --------------------------------------------------
vars_disc <- variable_plan |>
  filter(action == "backfill_disc") |>
  pull(variable)

disc_long <- municipalities_raw |>
  filter(year < 2022) |>
  select(year, code, all_of(vars_disc)) |>
  pivot_longer(all_of(vars_disc), names_to = "variable", values_to = "value") |>
  filter(!is.na(value)) |>
  group_by(code, variable) |>
  slice_max(year, n = 1, with_ties = FALSE) |>
  ungroup()

disc_source_year <- disc_long |>
  group_by(variable) |>
  summarise(source_year = max(year), .groups = "drop")

disc_wide <- disc_long |>
  select(-year) |>
  pivot_wider(names_from = variable, values_from = value)

# 07-Apply all fills (window first, then census and disc which only touch the
# still-NA cells, i.e. the 100%-missing variables ------------------------------
m_2022_filled <- m_2022 |>
  rows_patch(window_wide, by = "code", unmatched = "ignore") |>
  rows_patch(census_wide, by = "code", unmatched = "ignore") |>
  rows_patch(disc_wide, by = "code", unmatched = "ignore")

# 08-Remove truly empty variables and the redundant year column ----------------
vars_drop <- variable_plan |> filter(action == "drop") |> pull(variable)

m_sample <- m_2022_filled |>
  select(-all_of(vars_drop), -year)

# Check residual NAs
na_remaining <- m_sample |>
  summarise(across(-c(code, municipality), \(x) sum(is.na(x)))) |>
  pivot_longer(everything(), names_to = "variable", values_to = "n_na_final") |>
  filter(n_na_final > 0)

# 09-Audit ---------------------------------------------------------------------
# Combine source-year info from all three fill paths
source_years <- bind_rows(
  window_source_year,
  census_source_year,
  disc_source_year
)

sampling_audit <- variable_plan |>
  left_join(source_years, by = "variable") |>
  left_join(na_remaining, by = "variable") |>
  mutate(
    n_na_final = replace_na(n_na_final, 0L),
    years_from_2022 = if_else(
      !is.na(source_year),
      abs(source_year - 2022L),
      NA_integer_
    ),
    note = case_when(
      action == "keep" & is.na(fill_type) ~
        "Complete in 2022",

      fill_type == "residual_na" ~
        paste0(
          n_na,
          " municipalities (",
          round(pct_na * 100, 1),
          "%) have no 2022 value ",
          "and no [2020, 2021] data (variable measured only in specific years, e.g. elections)"
        ),

      action == "drop" ~
        "Dropped — no data anywhere in the panel",

      fill_type == "full_column" ~
        paste0(
          "Entire column backfilled from ",
          source_year,
          " (100% missing in 2022)"
        ),

      fill_type == "partial" ~
        paste0(
          round(pct_na * 100, 1),
          "% missing in 2022; ",
          "per-municipality fill from ≤",
          source_year,
          if_else(
            n_na_final > 0,
            paste0("; ", n_na_final, " municipalities remain NA"),
            "; fully resolved"
          )
        ),

      fill_type == "census_closest" ~
        paste0(
          "Periodic variable — no data in 2022; backfilled from ",
          source_year,
          " (",
          years_from_2022,
          " years from anchor)"
        ),

      fill_type == "discontinued_last" ~
        paste0(
          "Series discontinued; backfilled from most recent available year (",
          source_year,
          ", ",
          years_from_2022,
          " years from anchor)"
        )
    )
  ) |>
  select(
    variable,
    action,
    fill_type,
    n_na_2022 = n_na,
    pct_na_2022 = pct_na,
    n_na_final,
    first_year,
    last_year,
    source_year,
    years_from_2022,
    note
  ) |>
  arrange(action, fill_type, desc(years_from_2022))


# 10-Save ----------------------------------------------------------------------
write_rds(m_sample, "data/processed/m_sample.rds")
write_csv(sampling_audit, "data/processed/sampling_audit.csv")