ruralitic-qrm/src/municipalities/03-attainment-ts.R

118 lines
4.1 KiB
R
Raw Normal View History

# =============================================================================
# 03-attainment-ts.R · Educational attainment time-series from SCB PxWeb
# =============================================================================
#
# Table: UF0506B/Utbildning
# "Befolkning 16-74 år efter region, utbildningsnivå, ålder och kön. År 1985-2025"
#
# We fetch ages 25-64 (standard working-age window for education attainment),
# both genders, all 7 SUN education levels, for selected years.
# The API imposes an undocumented cell limit (~150k per request), so we
# batch across municipalities (50 per request).
#
# Education levels (SUN):
# 1-2 = förgymnasial (pre-secondary)
# 3-4 = gymnasial (upper secondary)
# 5-6 = eftergymnasial (post-secondary, non-doctoral)
# 7 = forskarutbildning (doctoral)
#
# Output: data/processed/attainment_ts.rds
# Columns: code, year, edu_level (1-7), n
# Also a summary: code, year, pct_postsec, n_25_64
library(tidyverse)
library(httr)
library(jsonlite)
SCB_URL <- "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/UF/UF0506/UF0506B/Utbildning"
YEARS <- c("2000", "2005", "2010", "2015", "2022")
LEVELS <- as.list(as.character(1:7))
AGES <- as.list(as.character(25:64))
GENDERS <- list("1", "2")
# Fetch municipality codes from table metadata
meta <- fromJSON(content(GET(SCB_URL), "text", encoding = "UTF-8"))
munis <- Filter(\(x) nchar(x) == 4, meta$variables$values[[1]])
cat("Municipalities to fetch:", length(munis), "\n")
# Batch fetch: 50 municipalities per request
fetch_batch <- function(muni_batch) {
query <- list(
query = list(
list(code = "Region",
selection = list(filter = "item", values = as.list(muni_batch))),
list(code = "Alder",
selection = list(filter = "item", values = AGES)),
list(code = "UtbildningsNiva",
selection = list(filter = "item", values = LEVELS)),
list(code = "Kon",
selection = list(filter = "item", values = GENDERS)),
list(code = "Tid",
selection = list(filter = "item", values = as.list(YEARS)))
),
response = list(format = "json")
)
resp <- POST(
SCB_URL,
body = toJSON(query, auto_unbox = TRUE),
encode = "raw",
content_type("application/json"),
timeout(60)
)
if (status_code(resp) != 200) {
stop("HTTP ", status_code(resp), " for batch starting at ", muni_batch[1])
}
d <- fromJSON(content(resp, "text", encoding = "UTF-8"))$data
# key is a list of character vectors: [region, age, edu_level, gender, year]
keys <- do.call(rbind, d$key)
tibble(
code = keys[, 1],
age = as.integer(keys[, 2]),
edu_level = as.integer(keys[, 3]),
gender = as.integer(keys[, 4]),
year = as.integer(keys[, 5]),
n = as.integer(unlist(d$values))
)
}
batches <- split(munis, ceiling(seq_along(munis) / 50))
raw_list <- vector("list", length(batches))
for (i in seq_along(batches)) {
cat(" Fetching batch", i, "/", length(batches),
"(munis", batches[[i]][1], "", tail(batches[[i]], 1), ")\n")
raw_list[[i]] <- tryCatch(
fetch_batch(batches[[i]]),
error = function(e) {
message(" Batch ", i, " failed: ", conditionMessage(e))
NULL
}
)
Sys.sleep(0.3)
}
raw <- bind_rows(compact(raw_list))
cat("Total rows fetched:", nrow(raw), "\n")
# Aggregate: sum across ages and genders → n per (code, year, edu_level)
attainment <- raw |>
group_by(code, year, edu_level) |>
summarise(n = sum(n, na.rm = TRUE), .groups = "drop")
# Summary: % with post-secondary education among 25-64 year olds
attainment_summary <- attainment |>
group_by(code, year) |>
summarise(
n_total = sum(n),
n_postsec = sum(n[edu_level >= 5]),
pct_postsec = 100 * n_postsec / n_total,
.groups = "drop"
)
write_rds(attainment, "data/processed/attainment_ts.rds")
write_rds(attainment_summary, "data/processed/attainment_summary.rds")
cat("Saved attainment_ts.rds and attainment_summary.rds\n")
cat("Years:", paste(sort(unique(attainment$year)), collapse = ", "), "\n")
cat("Municipalities:", n_distinct(attainment$code), "\n")