ruralitic-qrm/src/municipalities/03-attainment-ts.R
2026-05-08 10:15:20 +02:00

125 lines
3.8 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# =============================================================================
# 03-attainment-ts.R · Educational attainment time-series from SCB PxWeb
# =============================================================================
#
# Table: UF0506B/Utbildning
# "Befolkning 16-74 år efter region, utbildningsnivå, ålder och kön. År 1985-2025"
#
# We fetch ages 25-64 (standard working-age window for education attainment),
# both genders, all 7 SUN education levels, for selected years.
# The API imposes an undocumented cell limit (~150k per request), so we
# batch across municipalities (50 per request).
#
# Education levels (SUN):
# 1-2 = förgymnasial (pre-secondary)
# 3-4 = gymnasial (upper secondary)
# 5-6 = eftergymnasial (post-secondary, non-doctoral)
# 7 = forskarutbildning (doctoral)
#
# Output: data/processed/attainment_ts.rds
# Columns: code, year, edu_level (1-7), n
# Also a summary: code, year, pct_postsec, n_25_64
library(tidyverse)
library(httr)
library(jsonlite)
SCB_URL <- "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/UF/UF0506/UF0506B/Utbildning"
YEARS <- c("2000", "2005", "2010", "2015", "2022")
LEVELS <- as.list(as.character(1:7))
AGES <- as.list(as.character(25:64))
GENDERS <- list("1", "2")
# Fetch municipality codes from table metadata
meta <- fromJSON(content(GET(SCB_URL), "text", encoding = "UTF-8"))
munis <- Filter(\(x) nchar(x) == 4, meta$variables$values[[1]])
# Batch fetch: 50 municipalities per request
fetch_batch <- function(muni_batch) {
query <- list(
query = list(
list(
code = "Region",
selection = list(filter = "item", values = as.list(muni_batch))
),
list(code = "Alder", selection = list(filter = "item", values = AGES)),
list(
code = "UtbildningsNiva",
selection = list(filter = "item", values = LEVELS)
),
list(code = "Kon", selection = list(filter = "item", values = GENDERS)),
list(
code = "Tid",
selection = list(filter = "item", values = as.list(YEARS))
)
),
response = list(format = "json")
)
resp <- POST(
SCB_URL,
body = toJSON(query, auto_unbox = TRUE),
encode = "raw",
content_type("application/json"),
timeout(60)
)
if (status_code(resp) != 200) {
stop("HTTP ", status_code(resp), " for batch starting at ", muni_batch[1])
}
d <- fromJSON(content(resp, "text", encoding = "UTF-8"))$data
# key is a list of character vectors: [region, age, edu_level, gender, year]
keys <- do.call(rbind, d$key)
tibble(
code = keys[, 1],
age = as.integer(keys[, 2]),
edu_level = as.integer(keys[, 3]),
gender = as.integer(keys[, 4]),
year = as.integer(keys[, 5]),
n = as.integer(unlist(d$values))
)
}
batches <- split(munis, ceiling(seq_along(munis) / 50))
raw_list <- vector("list", length(batches))
for (i in seq_along(batches)) {
cat(
" Fetching batch",
i,
"/",
length(batches),
"(munis",
batches[[i]][1],
"",
tail(batches[[i]], 1),
")\n"
)
raw_list[[i]] <- tryCatch(
fetch_batch(batches[[i]]),
error = function(e) {
message(" Batch ", i, " failed: ", conditionMessage(e))
NULL
}
)
Sys.sleep(0.3)
}
raw <- bind_rows(compact(raw_list))
# Aggregate: sum across ages and genders → n per (code, year, edu_level)
attainment <- raw |>
group_by(code, year, edu_level) |>
summarise(n = sum(n, na.rm = TRUE), .groups = "drop")
# Summary: % with post-secondary education among 25-64 year olds
attainment_summary <- attainment |>
group_by(code, year) |>
summarise(
n_total = sum(n),
n_postsec = sum(n[edu_level >= 5]),
pct_postsec = 100 * n_postsec / n_total,
.groups = "drop"
)
write_rds(attainment, "data/processed/attainment_ts.rds")
write_rds(attainment_summary, "data/processed/attainment_summary.rds")