working on proportions

This commit is contained in:
pab 2026-05-22 09:26:29 +02:00
parent 9a9b42d654
commit b1fdb08399
4 changed files with 1525 additions and 0 deletions

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,363 @@
# =============================================================================
# proportions_CA.R · CA n°3 (revised) of the 2022 municipal cross-section
# =============================================================================
#
# Same variables as CA n°3 in CA_municipalities.qmd, with two changes:
#
# 1. Educational provision (forskoleklass, grundskola, gymnasieskola,
# komvux, sfi, university, university_college and the adapted/special/
# Sami variants; total, public, private) is moved from active to
# supplementary. The CA now describes the population/dwelling/mobility
# profile and provision is projected on top of it.
#
# 2. Counts are block-normalised: within each variable domain (edu, empl,
# housing, workplace, migrations, retirees, localities, provision,
# opinion) every municipality is rescaled to a common per-row total.
# Large cities no longer capture the first axis as a size axis.
#
# Inputs (no scraping/sampling re-run):
# data/Municipalities_db_2.xlsx: dataset with the 16-category employment
# and "Number of retirees"
# data/processed/edu_offer.rds: institution counts (skolverket + UKÄ)
#
# Output:
# data/processed/proportions_CA.rds: CA result (FactoMineR object)
library(tidyverse)
library(readxl)
library(FactoMineR)
library(factoextra)
library(ggrepel)
library(explor)
# 01 - Load --------------------------------------------------------------------
panel_raw <- read_excel(
"data/Municipalities_db_2.xlsx",
col_types = "text"
)
id_cols <- c("year", "code", "municipality")
panel <- panel_raw |>
mutate(year = as.integer(year)) |>
mutate(across(-all_of(id_cols), \(x) suppressWarnings(as.numeric(x))))
# 02 - Backfilled 2022 cross-section ------------------------------------------
# (01-sampling.R): use 2022 if available, then a window backfill from
# [2020, 2021], then census-closest, then discontinued-most-recent.
avail_full <- panel |>
pivot_longer(-all_of(id_cols), names_to = "variable", values_to = "value") |>
filter(!is.na(value)) |>
summarise(.by = variable,
first_year = min(year), last_year = max(year))
avail_window <- panel |>
filter(between(year, 2020, 2021)) |>
pivot_longer(-all_of(id_cols), names_to = "variable", values_to = "value") |>
filter(!is.na(value)) |>
summarise(.by = variable, window_last_year = max(year))
m_2022 <- panel |> filter(year == 2022)
n_munic <- nrow(m_2022)
na_2022 <- m_2022 |>
summarise(across(-all_of(id_cols), \(x) sum(is.na(x)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_na") |>
mutate(pct_na = n_na / n_munic)
variable_plan <- na_2022 |>
left_join(avail_full, by = "variable") |>
left_join(avail_window, by = "variable") |>
mutate(action = case_when(
n_na == 0 ~ "keep",
n_na > 0 & n_na < n_munic & is.na(window_last_year) ~ "keep",
n_na > 0 & !is.na(window_last_year) ~ "backfill_window",
n_na == n_munic & is.na(window_last_year) & !is.na(last_year) &
last_year > 2022 ~ "backfill_census",
n_na == n_munic & is.na(window_last_year) & !is.na(last_year) &
last_year <= 2022 ~ "backfill_disc",
is.na(last_year) ~ "drop"
))
backfill_latest <- function(panel_df, vars, year_filter) {
if (length(vars) == 0) return(tibble(code = character()))
panel_df |>
filter(year_filter(year)) |>
select(year, code, all_of(vars)) |>
pivot_longer(all_of(vars), names_to = "variable", values_to = "value") |>
filter(!is.na(value)) |>
mutate(distance = abs(year - 2022)) |>
group_by(code, variable) |>
slice_min(distance, n = 1, with_ties = FALSE) |>
ungroup() |>
select(-year, -distance) |>
pivot_wider(names_from = variable, values_from = value)
}
vars_window <- variable_plan |> filter(action == "backfill_window") |> pull(variable)
vars_census <- variable_plan |> filter(action == "backfill_census") |> pull(variable)
vars_disc <- variable_plan |> filter(action == "backfill_disc") |> pull(variable)
window_wide <- backfill_latest(panel, vars_window,
\(y) between(y, 2020, 2021))
census_wide <- backfill_latest(panel, vars_census, \(y) y != 2022)
disc_wide <- backfill_latest(panel, vars_disc, \(y) y < 2022)
vars_drop <- variable_plan |> filter(action == "drop") |> pull(variable)
m <- m_2022 |>
rows_patch(window_wide, by = "code", unmatched = "ignore") |>
rows_patch(census_wide, by = "code", unmatched = "ignore") |>
rows_patch(disc_wide, by = "code", unmatched = "ignore") |>
select(-all_of(vars_drop), -year) |>
mutate(code = str_pad(as.character(code), 4, "left", "0"))
edu_offer <- read_rds("data/processed/edu_offer.rds") |>
mutate(code = str_pad(as.character(code), 4, "left", "0")) |>
select(-municipality)
m <- left_join(m, edu_offer, by = "code")
# 02 - Variable blocks ---------------------------------------------------------
vars_edu <- c(
"Education level_primary_secondary",
"Education level_upper_secondary",
"Education level_post-secondary",
"Education level_post-graduate"
)
vars_empl <- c(
"Employment_16cat_companies in agriculture, forestry and fishing",
"Employment_16cat_mining, quarrying, manufacturing",
"Employment_16cat_energy and environmental companies",
"Employment_16cat_construction industry",
"Employment_16cat_trade",
"Employment_16cat_transport and storage companies",
"Employment_16cat_hotels and restaurants",
"Employment_16cat_information and communication companies",
"Employment_16cat_financial institutions and insurance companies",
"Employment_16cat_real estate companies",
"Employment_16cat_professional, scientific and technical companies; administrative and support service companies",
"Employment_16cat_public authorities and national defence",
"Employment_16cat_educational establishments",
"Employment_16cat_human health and social work establishments",
"Employment_16cat_establishments for arts, entertainment and recreation; other service companies etc.",
"Employment_16cat_unknown activity"
)
m <- m |>
mutate(
rented_total =
`Rented dwellings_one- or two-dwelling buildings` +
`Rented dwellings_multi-dwelling buildings` +
`Rented dwellings_other buildings` +
`Rented dwellings_special housing`,
tenant_owned_total =
`Tenant-owned dwellings_one- or two-dwelling buildings` +
`Tenant-owned dwellings_multi-dwelling buildings` +
`Tenant-owned dwellings_other buildings` +
`Tenant-owned dwellings_special housing`,
owner_occupied_total =
`Owner-occupied dwellings_one- or two-dwelling buildings` +
`Owner-occupied dwellings_multi-dwelling buildings` +
`Owner-occupied dwellings_other buildings` +
`Owner-occupied dwellings_special housing`
)
vars_housing <- c("rented_total", "tenant_owned_total", "owner_occupied_total")
vars_workplace <- c(
"Workplace_Commuters coming into the municipality",
"Workplace_Commuters leaving the municipality",
"Workplace_Working and living in the municipality"
)
vars_migration <- c("Number of inmigrations", "Number of outmigrations")
# Provision: all institution types, including the small ones (specialskola,
# sameskola, university). Rare types project further from the origin because
# their column profile is concentrated in a handful of host municipalities;
# the per-capita rate transformation below moderates this but does not
# eliminate it.
vars_provision <- c(
# Totals
"forskoleklass_n_total", "grundskola_n_total", "anpassad_grundskola_n_total",
"specialskola_n_total", "sameskola_n_total", "gymnasieskola_n_total",
"anpassad_gymnasieskola_n_total", "komvux_n_total", "sfi_n_total",
"university_n_total", "university_college_n_total",
# Public
"forskoleklass_n_public", "grundskola_n_public", "anpassad_grundskola_n_public",
"specialskola_n_public", "sameskola_n_public", "gymnasieskola_n_public",
"anpassad_gymnasieskola_n_public", "komvux_n_public", "sfi_n_public",
"university_n_public", "university_college_n_public",
# Private
"forskoleklass_n_private", "grundskola_n_private", "anpassad_grundskola_n_private",
"specialskola_n_private", "sameskola_n_private", "gymnasieskola_n_private",
"anpassad_gymnasieskola_n_private", "komvux_n_private", "sfi_n_private",
"university_n_private", "university_college_n_private"
)
vars_opinion <- c(
"Opinion on preschool_Bad (%)", "Opinion on preschool_Mid (%)",
"Opinion on preschool_Good (%)",
"Opinion on elementary school_Bad (%)", "Opinion on elementary school_Mid (%)",
"Opinion on elementary school_Good (%)",
"Opinion on highschool_Bad (%)", "Opinion on highschool_Mid (%)",
"Opinion on highschool_Good (%)"
)
vars_demography <- c("Number of retirees", "Number of localities")
vars_active <- c(vars_edu, vars_empl, vars_housing, vars_workplace,
vars_migration, vars_demography)
vars_supp <- c(vars_provision, vars_opinion)
vars_all <- c(vars_active, vars_supp)
# 03 - Build matrix and impute NAs with 0 --------------------------------------
tab <- m |> select(municipality, all_of(vars_all)) |> as.data.frame()
rownames(tab) <- tab$municipality
tab$municipality <- NULL
tab[is.na(tab)] <- 0
# 04 - Block normalisation -----------------------------------------------------
# Within each domain, scale every municipality's block to sum to 1000.
# Effect on CA:
# - row mass becomes equal across municipalities, no size effect on axes
# - each domain contributes the same total mass, no block dominates
normalise_block <- function(mat, cols, total = 1000) {
s <- rowSums(mat[, cols, drop = FALSE])
s[s == 0] <- 1
mat[, cols] <- mat[, cols] / s * total
mat
}
# Active blocks + opinion: normalise so axes are size-independent.
for (block in list(vars_edu, vars_empl, vars_housing, vars_workplace,
vars_migration, vars_demography, vars_opinion)) {
tab <- normalise_block(tab, block)
}
# Provision: convert to a per-capita rate (institutions per 100k inhabitants,
# using the pre-normalisation education total as a population proxy). Absolute
# counts produce highly concentrated column profiles for rare types (a handful
# of municipalities host all universities, etc.), which throws supplementary
# projections far outside the active cloud and makes the biplot unreadable.
# Per-capita rates are roughly comparable across municipalities of different
# size, so column profiles are more diffuse and the projections land where
# they can actually be read against the active modalities.
pop_proxy <- m |>
mutate(across(all_of(vars_edu), \(x) replace_na(x, 0))) |>
transmute(municipality, pop = rowSums(across(all_of(vars_edu)))) |>
deframe()
pop_proxy <- pop_proxy[rownames(tab)]
pop_proxy[pop_proxy == 0 | is.na(pop_proxy)] <- 1
tab[, vars_provision] <- sweep(tab[, vars_provision], 1, pop_proxy, `/`) * 1e5
tab_mat <- as.matrix(tab)
# 05 - Rename columns to short, plottable labels -------------------------------
label_map <- c(
# Education
"Education level_primary_secondary" = "Primary & lower secondary",
"Education level_upper_secondary" = "Upper secondary",
"Education level_post-secondary" = "Post-secondary",
"Education level_post-graduate" = "Post-graduate",
# Employment 16cat
"Employment_16cat_companies in agriculture, forestry and fishing" = "Agriculture, forestry, fishing",
"Employment_16cat_mining, quarrying, manufacturing" = "Mining & manufacturing",
"Employment_16cat_energy and environmental companies" = "Energy & environment",
"Employment_16cat_construction industry" = "Construction",
"Employment_16cat_trade" = "Trade",
"Employment_16cat_transport and storage companies" = "Transport & storage",
"Employment_16cat_hotels and restaurants" = "Hotels & restaurants",
"Employment_16cat_information and communication companies" = "IT & communication",
"Employment_16cat_financial institutions and insurance companies" = "Finance & insurance",
"Employment_16cat_real estate companies" = "Real estate",
"Employment_16cat_professional, scientific and technical companies; administrative and support service companies" = "Professional & technical services",
"Employment_16cat_public authorities and national defence" = "Public authorities & defence",
"Employment_16cat_educational establishments" = "Educational establishments",
"Employment_16cat_human health and social work establishments" = "Health & social work",
"Employment_16cat_establishments for arts, entertainment and recreation; other service companies etc." = "Arts, entertainment & recreation",
"Employment_16cat_unknown activity" = "Unknown activity",
# Housing
rented_total = "Rented dwellings",
tenant_owned_total = "Tenant-owned dwellings",
owner_occupied_total = "Owner-occupied dwellings",
# Workplace
"Workplace_Commuters coming into the municipality" = "Commuters in",
"Workplace_Commuters leaving the municipality" = "Commuters out",
"Workplace_Working and living in the municipality" = "Working & living in municipality",
# Migration & demography
"Number of inmigrations" = "Inmigrations",
"Number of outmigrations" = "Outmigrations",
"Number of retirees" = "Retirees",
"Number of localities" = "Localities",
# Provision (supp) — totals
forskoleklass_n_total = "Preschool class",
grundskola_n_total = "Primary school",
anpassad_grundskola_n_total = "Adapted primary",
specialskola_n_total = "Special school",
sameskola_n_total = "Sami school",
gymnasieskola_n_total = "Secondary school",
anpassad_gymnasieskola_n_total = "Adapted secondary",
komvux_n_total = "Komvux",
sfi_n_total = "SFI",
university_n_total = "University",
university_college_n_total = "University college",
# Provision (supp) — public
forskoleklass_n_public = "Public preschool class",
grundskola_n_public = "Public primary school",
anpassad_grundskola_n_public = "Public adapted primary",
specialskola_n_public = "Public special school",
sameskola_n_public = "Public Sami school",
gymnasieskola_n_public = "Public secondary school",
anpassad_gymnasieskola_n_public = "Public adapted secondary",
komvux_n_public = "Public Komvux",
sfi_n_public = "Public SFI",
university_n_public = "Public university",
university_college_n_public = "Public university college",
# Provision (supp) — private
forskoleklass_n_private = "Private preschool class",
grundskola_n_private = "Private primary school",
anpassad_grundskola_n_private = "Private adapted primary",
specialskola_n_private = "Private special school",
sameskola_n_private = "Private Sami school",
gymnasieskola_n_private = "Private secondary school",
anpassad_gymnasieskola_n_private = "Private adapted secondary",
komvux_n_private = "Private Komvux",
sfi_n_private = "Private SFI",
university_n_private = "Private university",
university_college_n_private = "Private university college",
# Opinion (supp)
"Opinion on preschool_Bad (%)" = "Bad opinion on preschool",
"Opinion on preschool_Mid (%)" = "Mid opinion on preschool",
"Opinion on preschool_Good (%)" = "Good opinion on preschool",
"Opinion on elementary school_Bad (%)" = "Bad opinion on elementary school",
"Opinion on elementary school_Mid (%)" = "Mid opinion on elementary school",
"Opinion on elementary school_Good (%)" = "Good opinion on elementary school",
"Opinion on highschool_Bad (%)" = "Bad opinion on highschool",
"Opinion on highschool_Mid (%)" = "Mid opinion on highschool",
"Opinion on highschool_Good (%)" = "Good opinion on highschool"
)
colnames(tab_mat) <- unname(label_map[colnames(tab_mat)])
supp_labels <- unname(label_map[vars_supp])
supp_idx <- which(colnames(tab_mat) %in% supp_labels)
# Drop any all-zero columns (e.g. an institution type with no private units
# anywhere) — they break CA's supplementary projection.
zero_cols <- which(colSums(tab_mat) == 0)
if (length(zero_cols) > 0) {
cat("Dropping all-zero columns:",
paste(colnames(tab_mat)[zero_cols], collapse = ", "), "\n")
tab_mat <- tab_mat[, -zero_cols]
supp_idx <- which(colnames(tab_mat) %in% supp_labels)
}
# 06 - Run CA ------------------------------------------------------------------
afc <- CA(tab_mat, col.sup = supp_idx, graph = FALSE)
# 07 - Save --------------------------------------------------------------------
write_rds(afc, "data/processed/proportions_CA.rds")
# 08 - Quick biplots for visual check ------------------------------------------
explor(afc)