working on proportions
This commit is contained in:
parent
9a9b42d654
commit
b1fdb08399
4 changed files with 1525 additions and 0 deletions
BIN
data/Municipalities_db_2.xlsx
Normal file
BIN
data/Municipalities_db_2.xlsx
Normal file
Binary file not shown.
1162
src/Clara/CA_municipalities.qmd
Normal file
1162
src/Clara/CA_municipalities.qmd
Normal file
File diff suppressed because it is too large
Load diff
363
src/municipalities/04-proportions_CA.R
Normal file
363
src/municipalities/04-proportions_CA.R
Normal file
|
|
@ -0,0 +1,363 @@
|
||||||
|
# =============================================================================
|
||||||
|
# proportions_CA.R · CA n°3 (revised) of the 2022 municipal cross-section
|
||||||
|
# =============================================================================
|
||||||
|
#
|
||||||
|
# Same variables as CA n°3 in CA_municipalities.qmd, with two changes:
|
||||||
|
#
|
||||||
|
# 1. Educational provision (forskoleklass, grundskola, gymnasieskola,
|
||||||
|
# komvux, sfi, university, university_college and the adapted/special/
|
||||||
|
# Sami variants; total, public, private) is moved from active to
|
||||||
|
# supplementary. The CA now describes the population/dwelling/mobility
|
||||||
|
# profile and provision is projected on top of it.
|
||||||
|
#
|
||||||
|
# 2. Counts are block-normalised: within each variable domain (edu, empl,
|
||||||
|
# housing, workplace, migrations, retirees, localities, provision,
|
||||||
|
# opinion) every municipality is rescaled to a common per-row total.
|
||||||
|
# Large cities no longer capture the first axis as a size axis.
|
||||||
|
#
|
||||||
|
# Inputs (no scraping/sampling re-run):
|
||||||
|
# data/Municipalities_db_2.xlsx: dataset with the 16-category employment
|
||||||
|
# and "Number of retirees"
|
||||||
|
# data/processed/edu_offer.rds: institution counts (skolverket + UKÄ)
|
||||||
|
#
|
||||||
|
# Output:
|
||||||
|
# data/processed/proportions_CA.rds: CA result (FactoMineR object)
|
||||||
|
|
||||||
|
library(tidyverse)
|
||||||
|
library(readxl)
|
||||||
|
library(FactoMineR)
|
||||||
|
library(factoextra)
|
||||||
|
library(ggrepel)
|
||||||
|
library(explor)
|
||||||
|
|
||||||
|
# 01 - Load --------------------------------------------------------------------
|
||||||
|
panel_raw <- read_excel(
|
||||||
|
"data/Municipalities_db_2.xlsx",
|
||||||
|
col_types = "text"
|
||||||
|
)
|
||||||
|
|
||||||
|
id_cols <- c("year", "code", "municipality")
|
||||||
|
panel <- panel_raw |>
|
||||||
|
mutate(year = as.integer(year)) |>
|
||||||
|
mutate(across(-all_of(id_cols), \(x) suppressWarnings(as.numeric(x))))
|
||||||
|
|
||||||
|
# 02 - Backfilled 2022 cross-section ------------------------------------------
|
||||||
|
# (01-sampling.R): use 2022 if available, then a window backfill from
|
||||||
|
# [2020, 2021], then census-closest, then discontinued-most-recent.
|
||||||
|
|
||||||
|
avail_full <- panel |>
|
||||||
|
pivot_longer(-all_of(id_cols), names_to = "variable", values_to = "value") |>
|
||||||
|
filter(!is.na(value)) |>
|
||||||
|
summarise(.by = variable,
|
||||||
|
first_year = min(year), last_year = max(year))
|
||||||
|
|
||||||
|
avail_window <- panel |>
|
||||||
|
filter(between(year, 2020, 2021)) |>
|
||||||
|
pivot_longer(-all_of(id_cols), names_to = "variable", values_to = "value") |>
|
||||||
|
filter(!is.na(value)) |>
|
||||||
|
summarise(.by = variable, window_last_year = max(year))
|
||||||
|
|
||||||
|
m_2022 <- panel |> filter(year == 2022)
|
||||||
|
n_munic <- nrow(m_2022)
|
||||||
|
|
||||||
|
na_2022 <- m_2022 |>
|
||||||
|
summarise(across(-all_of(id_cols), \(x) sum(is.na(x)))) |>
|
||||||
|
pivot_longer(everything(), names_to = "variable", values_to = "n_na") |>
|
||||||
|
mutate(pct_na = n_na / n_munic)
|
||||||
|
|
||||||
|
variable_plan <- na_2022 |>
|
||||||
|
left_join(avail_full, by = "variable") |>
|
||||||
|
left_join(avail_window, by = "variable") |>
|
||||||
|
mutate(action = case_when(
|
||||||
|
n_na == 0 ~ "keep",
|
||||||
|
n_na > 0 & n_na < n_munic & is.na(window_last_year) ~ "keep",
|
||||||
|
n_na > 0 & !is.na(window_last_year) ~ "backfill_window",
|
||||||
|
n_na == n_munic & is.na(window_last_year) & !is.na(last_year) &
|
||||||
|
last_year > 2022 ~ "backfill_census",
|
||||||
|
n_na == n_munic & is.na(window_last_year) & !is.na(last_year) &
|
||||||
|
last_year <= 2022 ~ "backfill_disc",
|
||||||
|
is.na(last_year) ~ "drop"
|
||||||
|
))
|
||||||
|
|
||||||
|
backfill_latest <- function(panel_df, vars, year_filter) {
|
||||||
|
if (length(vars) == 0) return(tibble(code = character()))
|
||||||
|
panel_df |>
|
||||||
|
filter(year_filter(year)) |>
|
||||||
|
select(year, code, all_of(vars)) |>
|
||||||
|
pivot_longer(all_of(vars), names_to = "variable", values_to = "value") |>
|
||||||
|
filter(!is.na(value)) |>
|
||||||
|
mutate(distance = abs(year - 2022)) |>
|
||||||
|
group_by(code, variable) |>
|
||||||
|
slice_min(distance, n = 1, with_ties = FALSE) |>
|
||||||
|
ungroup() |>
|
||||||
|
select(-year, -distance) |>
|
||||||
|
pivot_wider(names_from = variable, values_from = value)
|
||||||
|
}
|
||||||
|
|
||||||
|
vars_window <- variable_plan |> filter(action == "backfill_window") |> pull(variable)
|
||||||
|
vars_census <- variable_plan |> filter(action == "backfill_census") |> pull(variable)
|
||||||
|
vars_disc <- variable_plan |> filter(action == "backfill_disc") |> pull(variable)
|
||||||
|
|
||||||
|
window_wide <- backfill_latest(panel, vars_window,
|
||||||
|
\(y) between(y, 2020, 2021))
|
||||||
|
census_wide <- backfill_latest(panel, vars_census, \(y) y != 2022)
|
||||||
|
disc_wide <- backfill_latest(panel, vars_disc, \(y) y < 2022)
|
||||||
|
|
||||||
|
vars_drop <- variable_plan |> filter(action == "drop") |> pull(variable)
|
||||||
|
|
||||||
|
m <- m_2022 |>
|
||||||
|
rows_patch(window_wide, by = "code", unmatched = "ignore") |>
|
||||||
|
rows_patch(census_wide, by = "code", unmatched = "ignore") |>
|
||||||
|
rows_patch(disc_wide, by = "code", unmatched = "ignore") |>
|
||||||
|
select(-all_of(vars_drop), -year) |>
|
||||||
|
mutate(code = str_pad(as.character(code), 4, "left", "0"))
|
||||||
|
|
||||||
|
edu_offer <- read_rds("data/processed/edu_offer.rds") |>
|
||||||
|
mutate(code = str_pad(as.character(code), 4, "left", "0")) |>
|
||||||
|
select(-municipality)
|
||||||
|
|
||||||
|
m <- left_join(m, edu_offer, by = "code")
|
||||||
|
|
||||||
|
# 02 - Variable blocks ---------------------------------------------------------
|
||||||
|
vars_edu <- c(
|
||||||
|
"Education level_primary_secondary",
|
||||||
|
"Education level_upper_secondary",
|
||||||
|
"Education level_post-secondary",
|
||||||
|
"Education level_post-graduate"
|
||||||
|
)
|
||||||
|
|
||||||
|
vars_empl <- c(
|
||||||
|
"Employment_16cat_companies in agriculture, forestry and fishing",
|
||||||
|
"Employment_16cat_mining, quarrying, manufacturing",
|
||||||
|
"Employment_16cat_energy and environmental companies",
|
||||||
|
"Employment_16cat_construction industry",
|
||||||
|
"Employment_16cat_trade",
|
||||||
|
"Employment_16cat_transport and storage companies",
|
||||||
|
"Employment_16cat_hotels and restaurants",
|
||||||
|
"Employment_16cat_information and communication companies",
|
||||||
|
"Employment_16cat_financial institutions and insurance companies",
|
||||||
|
"Employment_16cat_real estate companies",
|
||||||
|
"Employment_16cat_professional, scientific and technical companies; administrative and support service companies",
|
||||||
|
"Employment_16cat_public authorities and national defence",
|
||||||
|
"Employment_16cat_educational establishments",
|
||||||
|
"Employment_16cat_human health and social work establishments",
|
||||||
|
"Employment_16cat_establishments for arts, entertainment and recreation; other service companies etc.",
|
||||||
|
"Employment_16cat_unknown activity"
|
||||||
|
)
|
||||||
|
|
||||||
|
m <- m |>
|
||||||
|
mutate(
|
||||||
|
rented_total =
|
||||||
|
`Rented dwellings_one- or two-dwelling buildings` +
|
||||||
|
`Rented dwellings_multi-dwelling buildings` +
|
||||||
|
`Rented dwellings_other buildings` +
|
||||||
|
`Rented dwellings_special housing`,
|
||||||
|
tenant_owned_total =
|
||||||
|
`Tenant-owned dwellings_one- or two-dwelling buildings` +
|
||||||
|
`Tenant-owned dwellings_multi-dwelling buildings` +
|
||||||
|
`Tenant-owned dwellings_other buildings` +
|
||||||
|
`Tenant-owned dwellings_special housing`,
|
||||||
|
owner_occupied_total =
|
||||||
|
`Owner-occupied dwellings_one- or two-dwelling buildings` +
|
||||||
|
`Owner-occupied dwellings_multi-dwelling buildings` +
|
||||||
|
`Owner-occupied dwellings_other buildings` +
|
||||||
|
`Owner-occupied dwellings_special housing`
|
||||||
|
)
|
||||||
|
|
||||||
|
vars_housing <- c("rented_total", "tenant_owned_total", "owner_occupied_total")
|
||||||
|
|
||||||
|
vars_workplace <- c(
|
||||||
|
"Workplace_Commuters coming into the municipality",
|
||||||
|
"Workplace_Commuters leaving the municipality",
|
||||||
|
"Workplace_Working and living in the municipality"
|
||||||
|
)
|
||||||
|
|
||||||
|
vars_migration <- c("Number of inmigrations", "Number of outmigrations")
|
||||||
|
|
||||||
|
# Provision: all institution types, including the small ones (specialskola,
|
||||||
|
# sameskola, university). Rare types project further from the origin because
|
||||||
|
# their column profile is concentrated in a handful of host municipalities;
|
||||||
|
# the per-capita rate transformation below moderates this but does not
|
||||||
|
# eliminate it.
|
||||||
|
vars_provision <- c(
|
||||||
|
# Totals
|
||||||
|
"forskoleklass_n_total", "grundskola_n_total", "anpassad_grundskola_n_total",
|
||||||
|
"specialskola_n_total", "sameskola_n_total", "gymnasieskola_n_total",
|
||||||
|
"anpassad_gymnasieskola_n_total", "komvux_n_total", "sfi_n_total",
|
||||||
|
"university_n_total", "university_college_n_total",
|
||||||
|
# Public
|
||||||
|
"forskoleklass_n_public", "grundskola_n_public", "anpassad_grundskola_n_public",
|
||||||
|
"specialskola_n_public", "sameskola_n_public", "gymnasieskola_n_public",
|
||||||
|
"anpassad_gymnasieskola_n_public", "komvux_n_public", "sfi_n_public",
|
||||||
|
"university_n_public", "university_college_n_public",
|
||||||
|
# Private
|
||||||
|
"forskoleklass_n_private", "grundskola_n_private", "anpassad_grundskola_n_private",
|
||||||
|
"specialskola_n_private", "sameskola_n_private", "gymnasieskola_n_private",
|
||||||
|
"anpassad_gymnasieskola_n_private", "komvux_n_private", "sfi_n_private",
|
||||||
|
"university_n_private", "university_college_n_private"
|
||||||
|
)
|
||||||
|
|
||||||
|
vars_opinion <- c(
|
||||||
|
"Opinion on preschool_Bad (%)", "Opinion on preschool_Mid (%)",
|
||||||
|
"Opinion on preschool_Good (%)",
|
||||||
|
"Opinion on elementary school_Bad (%)", "Opinion on elementary school_Mid (%)",
|
||||||
|
"Opinion on elementary school_Good (%)",
|
||||||
|
"Opinion on highschool_Bad (%)", "Opinion on highschool_Mid (%)",
|
||||||
|
"Opinion on highschool_Good (%)"
|
||||||
|
)
|
||||||
|
|
||||||
|
vars_demography <- c("Number of retirees", "Number of localities")
|
||||||
|
|
||||||
|
vars_active <- c(vars_edu, vars_empl, vars_housing, vars_workplace,
|
||||||
|
vars_migration, vars_demography)
|
||||||
|
vars_supp <- c(vars_provision, vars_opinion)
|
||||||
|
vars_all <- c(vars_active, vars_supp)
|
||||||
|
|
||||||
|
# 03 - Build matrix and impute NAs with 0 --------------------------------------
|
||||||
|
tab <- m |> select(municipality, all_of(vars_all)) |> as.data.frame()
|
||||||
|
rownames(tab) <- tab$municipality
|
||||||
|
tab$municipality <- NULL
|
||||||
|
tab[is.na(tab)] <- 0
|
||||||
|
|
||||||
|
# 04 - Block normalisation -----------------------------------------------------
|
||||||
|
# Within each domain, scale every municipality's block to sum to 1000.
|
||||||
|
# Effect on CA:
|
||||||
|
# - row mass becomes equal across municipalities, no size effect on axes
|
||||||
|
# - each domain contributes the same total mass, no block dominates
|
||||||
|
normalise_block <- function(mat, cols, total = 1000) {
|
||||||
|
s <- rowSums(mat[, cols, drop = FALSE])
|
||||||
|
s[s == 0] <- 1
|
||||||
|
mat[, cols] <- mat[, cols] / s * total
|
||||||
|
mat
|
||||||
|
}
|
||||||
|
|
||||||
|
# Active blocks + opinion: normalise so axes are size-independent.
|
||||||
|
for (block in list(vars_edu, vars_empl, vars_housing, vars_workplace,
|
||||||
|
vars_migration, vars_demography, vars_opinion)) {
|
||||||
|
tab <- normalise_block(tab, block)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Provision: convert to a per-capita rate (institutions per 100k inhabitants,
|
||||||
|
# using the pre-normalisation education total as a population proxy). Absolute
|
||||||
|
# counts produce highly concentrated column profiles for rare types (a handful
|
||||||
|
# of municipalities host all universities, etc.), which throws supplementary
|
||||||
|
# projections far outside the active cloud and makes the biplot unreadable.
|
||||||
|
# Per-capita rates are roughly comparable across municipalities of different
|
||||||
|
# size, so column profiles are more diffuse and the projections land where
|
||||||
|
# they can actually be read against the active modalities.
|
||||||
|
pop_proxy <- m |>
|
||||||
|
mutate(across(all_of(vars_edu), \(x) replace_na(x, 0))) |>
|
||||||
|
transmute(municipality, pop = rowSums(across(all_of(vars_edu)))) |>
|
||||||
|
deframe()
|
||||||
|
pop_proxy <- pop_proxy[rownames(tab)]
|
||||||
|
pop_proxy[pop_proxy == 0 | is.na(pop_proxy)] <- 1
|
||||||
|
tab[, vars_provision] <- sweep(tab[, vars_provision], 1, pop_proxy, `/`) * 1e5
|
||||||
|
|
||||||
|
tab_mat <- as.matrix(tab)
|
||||||
|
|
||||||
|
# 05 - Rename columns to short, plottable labels -------------------------------
|
||||||
|
label_map <- c(
|
||||||
|
# Education
|
||||||
|
"Education level_primary_secondary" = "Primary & lower secondary",
|
||||||
|
"Education level_upper_secondary" = "Upper secondary",
|
||||||
|
"Education level_post-secondary" = "Post-secondary",
|
||||||
|
"Education level_post-graduate" = "Post-graduate",
|
||||||
|
# Employment 16cat
|
||||||
|
"Employment_16cat_companies in agriculture, forestry and fishing" = "Agriculture, forestry, fishing",
|
||||||
|
"Employment_16cat_mining, quarrying, manufacturing" = "Mining & manufacturing",
|
||||||
|
"Employment_16cat_energy and environmental companies" = "Energy & environment",
|
||||||
|
"Employment_16cat_construction industry" = "Construction",
|
||||||
|
"Employment_16cat_trade" = "Trade",
|
||||||
|
"Employment_16cat_transport and storage companies" = "Transport & storage",
|
||||||
|
"Employment_16cat_hotels and restaurants" = "Hotels & restaurants",
|
||||||
|
"Employment_16cat_information and communication companies" = "IT & communication",
|
||||||
|
"Employment_16cat_financial institutions and insurance companies" = "Finance & insurance",
|
||||||
|
"Employment_16cat_real estate companies" = "Real estate",
|
||||||
|
"Employment_16cat_professional, scientific and technical companies; administrative and support service companies" = "Professional & technical services",
|
||||||
|
"Employment_16cat_public authorities and national defence" = "Public authorities & defence",
|
||||||
|
"Employment_16cat_educational establishments" = "Educational establishments",
|
||||||
|
"Employment_16cat_human health and social work establishments" = "Health & social work",
|
||||||
|
"Employment_16cat_establishments for arts, entertainment and recreation; other service companies etc." = "Arts, entertainment & recreation",
|
||||||
|
"Employment_16cat_unknown activity" = "Unknown activity",
|
||||||
|
# Housing
|
||||||
|
rented_total = "Rented dwellings",
|
||||||
|
tenant_owned_total = "Tenant-owned dwellings",
|
||||||
|
owner_occupied_total = "Owner-occupied dwellings",
|
||||||
|
# Workplace
|
||||||
|
"Workplace_Commuters coming into the municipality" = "Commuters in",
|
||||||
|
"Workplace_Commuters leaving the municipality" = "Commuters out",
|
||||||
|
"Workplace_Working and living in the municipality" = "Working & living in municipality",
|
||||||
|
# Migration & demography
|
||||||
|
"Number of inmigrations" = "Inmigrations",
|
||||||
|
"Number of outmigrations" = "Outmigrations",
|
||||||
|
"Number of retirees" = "Retirees",
|
||||||
|
"Number of localities" = "Localities",
|
||||||
|
# Provision (supp) — totals
|
||||||
|
forskoleklass_n_total = "Preschool class",
|
||||||
|
grundskola_n_total = "Primary school",
|
||||||
|
anpassad_grundskola_n_total = "Adapted primary",
|
||||||
|
specialskola_n_total = "Special school",
|
||||||
|
sameskola_n_total = "Sami school",
|
||||||
|
gymnasieskola_n_total = "Secondary school",
|
||||||
|
anpassad_gymnasieskola_n_total = "Adapted secondary",
|
||||||
|
komvux_n_total = "Komvux",
|
||||||
|
sfi_n_total = "SFI",
|
||||||
|
university_n_total = "University",
|
||||||
|
university_college_n_total = "University college",
|
||||||
|
# Provision (supp) — public
|
||||||
|
forskoleklass_n_public = "Public preschool class",
|
||||||
|
grundskola_n_public = "Public primary school",
|
||||||
|
anpassad_grundskola_n_public = "Public adapted primary",
|
||||||
|
specialskola_n_public = "Public special school",
|
||||||
|
sameskola_n_public = "Public Sami school",
|
||||||
|
gymnasieskola_n_public = "Public secondary school",
|
||||||
|
anpassad_gymnasieskola_n_public = "Public adapted secondary",
|
||||||
|
komvux_n_public = "Public Komvux",
|
||||||
|
sfi_n_public = "Public SFI",
|
||||||
|
university_n_public = "Public university",
|
||||||
|
university_college_n_public = "Public university college",
|
||||||
|
# Provision (supp) — private
|
||||||
|
forskoleklass_n_private = "Private preschool class",
|
||||||
|
grundskola_n_private = "Private primary school",
|
||||||
|
anpassad_grundskola_n_private = "Private adapted primary",
|
||||||
|
specialskola_n_private = "Private special school",
|
||||||
|
sameskola_n_private = "Private Sami school",
|
||||||
|
gymnasieskola_n_private = "Private secondary school",
|
||||||
|
anpassad_gymnasieskola_n_private = "Private adapted secondary",
|
||||||
|
komvux_n_private = "Private Komvux",
|
||||||
|
sfi_n_private = "Private SFI",
|
||||||
|
university_n_private = "Private university",
|
||||||
|
university_college_n_private = "Private university college",
|
||||||
|
# Opinion (supp)
|
||||||
|
"Opinion on preschool_Bad (%)" = "Bad opinion on preschool",
|
||||||
|
"Opinion on preschool_Mid (%)" = "Mid opinion on preschool",
|
||||||
|
"Opinion on preschool_Good (%)" = "Good opinion on preschool",
|
||||||
|
"Opinion on elementary school_Bad (%)" = "Bad opinion on elementary school",
|
||||||
|
"Opinion on elementary school_Mid (%)" = "Mid opinion on elementary school",
|
||||||
|
"Opinion on elementary school_Good (%)" = "Good opinion on elementary school",
|
||||||
|
"Opinion on highschool_Bad (%)" = "Bad opinion on highschool",
|
||||||
|
"Opinion on highschool_Mid (%)" = "Mid opinion on highschool",
|
||||||
|
"Opinion on highschool_Good (%)" = "Good opinion on highschool"
|
||||||
|
)
|
||||||
|
colnames(tab_mat) <- unname(label_map[colnames(tab_mat)])
|
||||||
|
supp_labels <- unname(label_map[vars_supp])
|
||||||
|
supp_idx <- which(colnames(tab_mat) %in% supp_labels)
|
||||||
|
|
||||||
|
# Drop any all-zero columns (e.g. an institution type with no private units
|
||||||
|
# anywhere) — they break CA's supplementary projection.
|
||||||
|
zero_cols <- which(colSums(tab_mat) == 0)
|
||||||
|
if (length(zero_cols) > 0) {
|
||||||
|
cat("Dropping all-zero columns:",
|
||||||
|
paste(colnames(tab_mat)[zero_cols], collapse = ", "), "\n")
|
||||||
|
tab_mat <- tab_mat[, -zero_cols]
|
||||||
|
supp_idx <- which(colnames(tab_mat) %in% supp_labels)
|
||||||
|
}
|
||||||
|
|
||||||
|
# 06 - Run CA ------------------------------------------------------------------
|
||||||
|
afc <- CA(tab_mat, col.sup = supp_idx, graph = FALSE)
|
||||||
|
|
||||||
|
# 07 - Save --------------------------------------------------------------------
|
||||||
|
write_rds(afc, "data/processed/proportions_CA.rds")
|
||||||
|
|
||||||
|
# 08 - Quick biplots for visual check ------------------------------------------
|
||||||
|
explor(afc)
|
||||||
Loading…
Add table
Reference in a new issue