working on proportions

2026-05-22 09:26:29 +02:00 · 2026-05-22 09:26:29 +02:00 · b1fdb08399
commit b1fdb08399
parent 9a9b42d654
4 changed files with 1525 additions and 0 deletions
--- a/data/Municipalities_db_2.xlsx
+++ b/data/Municipalities_db_2.xlsx
--- a/src/Clara/CA_municipalities.qmd
+++ b/src/Clara/CA_municipalities.qmd
--- a/src/municipalities/04-proportions_CA.R
+++ b/src/municipalities/04-proportions_CA.R
@ -0,0 +1,363 @@
 # =============================================================================
 #  proportions_CA.R  ·  CA n°3 (revised) of the 2022 municipal cross-section
 # =============================================================================
 #
 #  Same variables as CA n°3 in CA_municipalities.qmd, with two changes:
 #
 #   1. Educational provision (forskoleklass, grundskola, gymnasieskola,
 #      komvux, sfi, university, university_college and the adapted/special/
 #      Sami variants; total, public, private) is moved from active to
 #      supplementary. The CA now describes the population/dwelling/mobility
 #      profile and provision is projected on top of it.
 #
 #   2. Counts are block-normalised: within each variable domain (edu, empl,
 #      housing, workplace, migrations, retirees, localities, provision,
 #      opinion) every municipality is rescaled to a common per-row total.
 #      Large cities no longer capture the first axis as a size axis.
 #
 #  Inputs (no scraping/sampling re-run):
 #    data/Municipalities_db_2.xlsx:  dataset with the 16-category employment
 #                                    and "Number of retirees"
 #    data/processed/edu_offer.rds:   institution counts (skolverket + UKÄ)
 #
 #  Output:
 #    data/processed/proportions_CA.rds: CA result (FactoMineR object)
 library(tidyverse)
 library(readxl)
 library(FactoMineR)
 library(factoextra)
 library(ggrepel)
 library(explor)
 # 01 - Load --------------------------------------------------------------------
 panel_raw <- read_excel(
  "data/Municipalities_db_2.xlsx",
  col_types = "text"
 )
 id_cols <- c("year", "code", "municipality")
 panel <- panel_raw |>
  mutate(year = as.integer(year)) |>
  mutate(across(-all_of(id_cols), \(x) suppressWarnings(as.numeric(x))))
 # 02 - Backfilled 2022 cross-section ------------------------------------------
 # (01-sampling.R): use 2022 if available, then a window backfill from
 # [2020, 2021], then census-closest, then discontinued-most-recent.
 avail_full <- panel |>
  pivot_longer(-all_of(id_cols), names_to = "variable", values_to = "value") |>
  filter(!is.na(value)) |>
  summarise(.by = variable,
            first_year = min(year), last_year = max(year))
 avail_window <- panel |>
  filter(between(year, 2020, 2021)) |>
  pivot_longer(-all_of(id_cols), names_to = "variable", values_to = "value") |>
  filter(!is.na(value)) |>
  summarise(.by = variable, window_last_year = max(year))
 m_2022 <- panel |> filter(year == 2022)
 n_munic <- nrow(m_2022)
 na_2022 <- m_2022 |>
  summarise(across(-all_of(id_cols), \(x) sum(is.na(x)))) |>
  pivot_longer(everything(), names_to = "variable", values_to = "n_na") |>
  mutate(pct_na = n_na / n_munic)
 variable_plan <- na_2022 |>
  left_join(avail_full,   by = "variable") |>
  left_join(avail_window, by = "variable") |>
  mutate(action = case_when(
    n_na == 0                                                              ~ "keep",
    n_na > 0 & n_na < n_munic & is.na(window_last_year)                    ~ "keep",
    n_na > 0 & !is.na(window_last_year)                                    ~ "backfill_window",
    n_na == n_munic & is.na(window_last_year) & !is.na(last_year) &
      last_year > 2022                                                     ~ "backfill_census",
    n_na == n_munic & is.na(window_last_year) & !is.na(last_year) &
      last_year <= 2022                                                    ~ "backfill_disc",
    is.na(last_year)                                                       ~ "drop"
  ))
 backfill_latest <- function(panel_df, vars, year_filter) {
  if (length(vars) == 0) return(tibble(code = character()))
  panel_df |>
    filter(year_filter(year)) |>
    select(year, code, all_of(vars)) |>
    pivot_longer(all_of(vars), names_to = "variable", values_to = "value") |>
    filter(!is.na(value)) |>
    mutate(distance = abs(year - 2022)) |>
    group_by(code, variable) |>
    slice_min(distance, n = 1, with_ties = FALSE) |>
    ungroup() |>
    select(-year, -distance) |>
    pivot_wider(names_from = variable, values_from = value)
 }
 vars_window <- variable_plan |> filter(action == "backfill_window") |> pull(variable)
 vars_census <- variable_plan |> filter(action == "backfill_census") |> pull(variable)
 vars_disc   <- variable_plan |> filter(action == "backfill_disc")   |> pull(variable)
 window_wide <- backfill_latest(panel, vars_window,
                               \(y) between(y, 2020, 2021))
 census_wide <- backfill_latest(panel, vars_census, \(y) y != 2022)
 disc_wide   <- backfill_latest(panel, vars_disc,   \(y) y <  2022)
 vars_drop <- variable_plan |> filter(action == "drop") |> pull(variable)
 m <- m_2022 |>
  rows_patch(window_wide, by = "code", unmatched = "ignore") |>
  rows_patch(census_wide, by = "code", unmatched = "ignore") |>
  rows_patch(disc_wide,   by = "code", unmatched = "ignore") |>
  select(-all_of(vars_drop), -year) |>
  mutate(code = str_pad(as.character(code), 4, "left", "0"))
 edu_offer <- read_rds("data/processed/edu_offer.rds") |>
  mutate(code = str_pad(as.character(code), 4, "left", "0")) |>
  select(-municipality)
 m <- left_join(m, edu_offer, by = "code")
 # 02 - Variable blocks ---------------------------------------------------------
 vars_edu <- c(
  "Education level_primary_secondary",
  "Education level_upper_secondary",
  "Education level_post-secondary",
  "Education level_post-graduate"
 )
 vars_empl <- c(
  "Employment_16cat_companies in agriculture, forestry and fishing",
  "Employment_16cat_mining, quarrying, manufacturing",
  "Employment_16cat_energy and environmental companies",
  "Employment_16cat_construction industry",
  "Employment_16cat_trade",
  "Employment_16cat_transport and storage companies",
  "Employment_16cat_hotels and restaurants",
  "Employment_16cat_information and communication companies",
  "Employment_16cat_financial institutions and insurance companies",
  "Employment_16cat_real estate companies",
  "Employment_16cat_professional, scientific and technical companies; administrative and support service companies",
  "Employment_16cat_public authorities and national defence",
  "Employment_16cat_educational establishments",
  "Employment_16cat_human health and social work establishments",
  "Employment_16cat_establishments for arts, entertainment and recreation; other service companies etc.",
  "Employment_16cat_unknown activity"
 )
 m <- m |>
  mutate(
    rented_total =
      `Rented dwellings_one- or two-dwelling buildings` +
      `Rented dwellings_multi-dwelling buildings` +
      `Rented dwellings_other buildings` +
      `Rented dwellings_special housing`,
    tenant_owned_total =
      `Tenant-owned dwellings_one- or two-dwelling buildings` +
      `Tenant-owned dwellings_multi-dwelling buildings` +
      `Tenant-owned dwellings_other buildings` +
      `Tenant-owned dwellings_special housing`,
    owner_occupied_total =
      `Owner-occupied dwellings_one- or two-dwelling buildings` +
      `Owner-occupied dwellings_multi-dwelling buildings` +
      `Owner-occupied dwellings_other buildings` +
      `Owner-occupied dwellings_special housing`
  )
 vars_housing <- c("rented_total", "tenant_owned_total", "owner_occupied_total")
 vars_workplace <- c(
  "Workplace_Commuters coming into the municipality",
  "Workplace_Commuters leaving the municipality",
  "Workplace_Working and living in the municipality"
 )
 vars_migration <- c("Number of inmigrations", "Number of outmigrations")
 # Provision: all institution types, including the small ones (specialskola,
 # sameskola, university). Rare types project further from the origin because
 # their column profile is concentrated in a handful of host municipalities;
 # the per-capita rate transformation below moderates this but does not
 # eliminate it.
 vars_provision <- c(
  # Totals
  "forskoleklass_n_total", "grundskola_n_total", "anpassad_grundskola_n_total",
  "specialskola_n_total", "sameskola_n_total", "gymnasieskola_n_total",
  "anpassad_gymnasieskola_n_total", "komvux_n_total", "sfi_n_total",
  "university_n_total", "university_college_n_total",
  # Public
  "forskoleklass_n_public", "grundskola_n_public", "anpassad_grundskola_n_public",
  "specialskola_n_public", "sameskola_n_public", "gymnasieskola_n_public",
  "anpassad_gymnasieskola_n_public", "komvux_n_public", "sfi_n_public",
  "university_n_public", "university_college_n_public",
  # Private
  "forskoleklass_n_private", "grundskola_n_private", "anpassad_grundskola_n_private",
  "specialskola_n_private", "sameskola_n_private", "gymnasieskola_n_private",
  "anpassad_gymnasieskola_n_private", "komvux_n_private", "sfi_n_private",
  "university_n_private", "university_college_n_private"
 )
 vars_opinion <- c(
  "Opinion on preschool_Bad (%)", "Opinion on preschool_Mid (%)",
  "Opinion on preschool_Good (%)",
  "Opinion on elementary school_Bad (%)", "Opinion on elementary school_Mid (%)",
  "Opinion on elementary school_Good (%)",
  "Opinion on highschool_Bad (%)", "Opinion on highschool_Mid (%)",
  "Opinion on highschool_Good (%)"
 )
 vars_demography <- c("Number of retirees", "Number of localities")
 vars_active <- c(vars_edu, vars_empl, vars_housing, vars_workplace,
                 vars_migration, vars_demography)
 vars_supp   <- c(vars_provision, vars_opinion)
 vars_all    <- c(vars_active, vars_supp)
 # 03 - Build matrix and impute NAs with 0 --------------------------------------
 tab <- m |> select(municipality, all_of(vars_all)) |> as.data.frame()
 rownames(tab) <- tab$municipality
 tab$municipality <- NULL
 tab[is.na(tab)] <- 0
 # 04 - Block normalisation -----------------------------------------------------
 # Within each domain, scale every municipality's block to sum to 1000.
 # Effect on CA:
 #   - row mass becomes equal across municipalities, no size effect on axes
 #   - each domain contributes the same total mass, no block dominates
 normalise_block <- function(mat, cols, total = 1000) {
  s <- rowSums(mat[, cols, drop = FALSE])
  s[s == 0] <- 1
  mat[, cols] <- mat[, cols] / s * total
  mat
 }
 # Active blocks + opinion: normalise so axes are size-independent.
 for (block in list(vars_edu, vars_empl, vars_housing, vars_workplace,
                   vars_migration, vars_demography, vars_opinion)) {
  tab <- normalise_block(tab, block)
 }
 # Provision: convert to a per-capita rate (institutions per 100k inhabitants,
 # using the pre-normalisation education total as a population proxy). Absolute
 # counts produce highly concentrated column profiles for rare types (a handful
 # of municipalities host all universities, etc.), which throws supplementary
 # projections far outside the active cloud and makes the biplot unreadable.
 # Per-capita rates are roughly comparable across municipalities of different
 # size, so column profiles are more diffuse and the projections land where
 # they can actually be read against the active modalities.
 pop_proxy <- m |>
  mutate(across(all_of(vars_edu), \(x) replace_na(x, 0))) |>
  transmute(municipality, pop = rowSums(across(all_of(vars_edu)))) |>
  deframe()
 pop_proxy <- pop_proxy[rownames(tab)]
 pop_proxy[pop_proxy == 0 | is.na(pop_proxy)] <- 1
 tab[, vars_provision] <- sweep(tab[, vars_provision], 1, pop_proxy, `/`) * 1e5
 tab_mat <- as.matrix(tab)
 # 05 - Rename columns to short, plottable labels -------------------------------
 label_map <- c(
  # Education
  "Education level_primary_secondary" = "Primary & lower secondary",
  "Education level_upper_secondary"   = "Upper secondary",
  "Education level_post-secondary"    = "Post-secondary",
  "Education level_post-graduate"     = "Post-graduate",
  # Employment 16cat
  "Employment_16cat_companies in agriculture, forestry and fishing" = "Agriculture, forestry, fishing",
  "Employment_16cat_mining, quarrying, manufacturing"               = "Mining & manufacturing",
  "Employment_16cat_energy and environmental companies"             = "Energy & environment",
  "Employment_16cat_construction industry"                          = "Construction",
  "Employment_16cat_trade"                                          = "Trade",
  "Employment_16cat_transport and storage companies"                = "Transport & storage",
  "Employment_16cat_hotels and restaurants"                         = "Hotels & restaurants",
  "Employment_16cat_information and communication companies"        = "IT & communication",
  "Employment_16cat_financial institutions and insurance companies" = "Finance & insurance",
  "Employment_16cat_real estate companies"                          = "Real estate",
  "Employment_16cat_professional, scientific and technical companies; administrative and support service companies" = "Professional & technical services",
  "Employment_16cat_public authorities and national defence"        = "Public authorities & defence",
  "Employment_16cat_educational establishments"                     = "Educational establishments",
  "Employment_16cat_human health and social work establishments"    = "Health & social work",
  "Employment_16cat_establishments for arts, entertainment and recreation; other service companies etc." = "Arts, entertainment & recreation",
  "Employment_16cat_unknown activity"                               = "Unknown activity",
  # Housing
  rented_total          = "Rented dwellings",
  tenant_owned_total    = "Tenant-owned dwellings",
  owner_occupied_total  = "Owner-occupied dwellings",
  # Workplace
  "Workplace_Commuters coming into the municipality" = "Commuters in",
  "Workplace_Commuters leaving the municipality"     = "Commuters out",
  "Workplace_Working and living in the municipality" = "Working & living in municipality",
  # Migration & demography
  "Number of inmigrations"  = "Inmigrations",
  "Number of outmigrations" = "Outmigrations",
  "Number of retirees"      = "Retirees",
  "Number of localities"    = "Localities",
  # Provision (supp) — totals
  forskoleklass_n_total          = "Preschool class",
  grundskola_n_total             = "Primary school",
  anpassad_grundskola_n_total    = "Adapted primary",
  specialskola_n_total           = "Special school",
  sameskola_n_total              = "Sami school",
  gymnasieskola_n_total          = "Secondary school",
  anpassad_gymnasieskola_n_total = "Adapted secondary",
  komvux_n_total                 = "Komvux",
  sfi_n_total                    = "SFI",
  university_n_total             = "University",
  university_college_n_total     = "University college",
  # Provision (supp) — public
  forskoleklass_n_public          = "Public preschool class",
  grundskola_n_public             = "Public primary school",
  anpassad_grundskola_n_public    = "Public adapted primary",
  specialskola_n_public           = "Public special school",
  sameskola_n_public              = "Public Sami school",
  gymnasieskola_n_public          = "Public secondary school",
  anpassad_gymnasieskola_n_public = "Public adapted secondary",
  komvux_n_public                 = "Public Komvux",
  sfi_n_public                    = "Public SFI",
  university_n_public             = "Public university",
  university_college_n_public     = "Public university college",
  # Provision (supp) — private
  forskoleklass_n_private          = "Private preschool class",
  grundskola_n_private             = "Private primary school",
  anpassad_grundskola_n_private    = "Private adapted primary",
  specialskola_n_private           = "Private special school",
  sameskola_n_private              = "Private Sami school",
  gymnasieskola_n_private          = "Private secondary school",
  anpassad_gymnasieskola_n_private = "Private adapted secondary",
  komvux_n_private                 = "Private Komvux",
  sfi_n_private                    = "Private SFI",
  university_n_private             = "Private university",
  university_college_n_private     = "Private university college",
  # Opinion (supp)
  "Opinion on preschool_Bad (%)"           = "Bad opinion on preschool",
  "Opinion on preschool_Mid (%)"           = "Mid opinion on preschool",
  "Opinion on preschool_Good (%)"          = "Good opinion on preschool",
  "Opinion on elementary school_Bad (%)"   = "Bad opinion on elementary school",
  "Opinion on elementary school_Mid (%)"   = "Mid opinion on elementary school",
  "Opinion on elementary school_Good (%)"  = "Good opinion on elementary school",
  "Opinion on highschool_Bad (%)"          = "Bad opinion on highschool",
  "Opinion on highschool_Mid (%)"          = "Mid opinion on highschool",
  "Opinion on highschool_Good (%)"         = "Good opinion on highschool"
 )
 colnames(tab_mat) <- unname(label_map[colnames(tab_mat)])
 supp_labels <- unname(label_map[vars_supp])
 supp_idx <- which(colnames(tab_mat) %in% supp_labels)
 # Drop any all-zero columns (e.g. an institution type with no private units
 # anywhere) — they break CA's supplementary projection.
 zero_cols <- which(colSums(tab_mat) == 0)
 if (length(zero_cols) > 0) {
  cat("Dropping all-zero columns:",
      paste(colnames(tab_mat)[zero_cols], collapse = ", "), "\n")
  tab_mat <- tab_mat[, -zero_cols]
  supp_idx <- which(colnames(tab_mat) %in% supp_labels)
 }
 # 06 - Run CA ------------------------------------------------------------------
 afc <- CA(tab_mat, col.sup = supp_idx, graph = FALSE)
 # 07 - Save --------------------------------------------------------------------
 write_rds(afc, "data/processed/proportions_CA.rds")
 # 08 - Quick biplots for visual check ------------------------------------------
 explor(afc)
--- a/src/scraping/skolverket.R
+++ b/src/scraping/skolverket.R