# =============================================================================
# 03-attainment-ts.R · Educational attainment time-series from SCB PxWeb
# =============================================================================
#
# Table: UF0506B/Utbildning
#   "Befolkning 16-74 år efter region, utbildningsnivå, ålder och kön. År 1985-2025"
#
# We fetch ages 25-64 (standard working-age window for education attainment),
# both genders, all 7 SUN education levels, for selected years.
# The API imposes an undocumented cell limit (~150k per request), so we
# batch across municipalities (50 per request).
#
# Education levels (SUN):
#   1-2 = förgymnasial (pre-secondary)
#   3-4 = gymnasial (upper secondary)
#   5-6 = eftergymnasial (post-secondary, non-doctoral)
#   7   = forskarutbildning (doctoral)
#
# Output:
#   data/processed/attainment_ts.rds
#     Columns: code, year, edu_level (1-7), n
#   data/processed/attainment_summary.rds
#     Columns: code, year, n_total, n_postsec, pct_postsec
#
# Batches that fail are skipped (best effort), but the script ends with a
# warning that lists the municipalities missing from the output.

library(tidyverse)
library(httr)
library(jsonlite)

SCB_URL <- "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/UF/UF0506/UF0506B/Utbildning"

YEARS   <- c("2000", "2005", "2010", "2015", "2022")
LEVELS  <- as.list(as.character(1:7))
AGES    <- as.list(as.character(25:64))
GENDERS <- list("1", "2")

# 50 municipalities x 40 ages x 7 levels x 2 genders x 5 years = 140,000 cells
# per request, which stays under the ~150k limit mentioned in the header.
BATCH_SIZE <- 50
OUT_DIR    <- "data/processed"

# ---- Municipality codes from table metadata ---------------------------------

meta_resp <- GET(SCB_URL, timeout(60))
if (status_code(meta_resp) != 200) {
  stop("HTTP ", status_code(meta_resp), " fetching table metadata",
       call. = FALSE)
}
meta <- fromJSON(content(meta_resp, "text", encoding = "UTF-8"))

# Locate the region variable by its code; fall back to the first variable,
# which is what this script historically assumed.
region_idx <- match("Region", meta$variables$code)
if (is.na(region_idx)) {
  region_idx <- 1L
}
region_values <- meta$variables$values[[region_idx]]

# Keep only 4-character region codes (the municipality level in this script).
munis <- region_values[nchar(region_values) == 4]
cat("Municipalities to fetch:", length(munis), "\n")

# ---- Batch fetch -------------------------------------------------------------

# Fetch one batch of municipalities from the SCB table.
#
# Args:
#   muni_batch: character vector of region codes to request.
#
# Returns:
#   A tibble with one row per returned cell and columns
#   code, age, edu_level, gender, year, n.
#
# Raises:
#   An error if the request still fails after retries, the final HTTP status
#   is not 200, or the response contains no data rows.
fetch_batch <- function(muni_batch) {
  # Unnamed lists serialise to JSON arrays even when they hold one element,
  # which is why every `values` entry is a list rather than a vector.
  query <- list(
    query = list(
      list(code = "Region",
           selection = list(filter = "item", values = as.list(muni_batch))),
      list(code = "Alder",
           selection = list(filter = "item", values = AGES)),
      list(code = "UtbildningsNiva",
           selection = list(filter = "item", values = LEVELS)),
      list(code = "Kon",
           selection = list(filter = "item", values = GENDERS)),
      list(code = "Tid",
           selection = list(filter = "item", values = as.list(YEARS)))
    ),
    response = list(format = "json")
  )

  # Retry transient failures (connection errors, 429, 5xx) with backoff, but
  # give up immediately on client errors that a retry cannot fix.
  resp <- RETRY(
    "POST",
    SCB_URL,
    body = toJSON(query, auto_unbox = TRUE),
    encode = "raw",
    content_type("application/json"),
    timeout(60),
    times = 3,
    pause_base = 2,
    quiet = TRUE,
    terminate_on = c(400, 403, 404)
  )

  if (status_code(resp) != 200) {
    stop("HTTP ", status_code(resp), " for batch starting at ", muni_batch[1])
  }

  d <- fromJSON(content(resp, "text", encoding = "UTF-8"))$data
  if (length(d) == 0 || length(d$key) == 0) {
    stop("Empty response for batch starting at ", muni_batch[1])
  }

  # key is a list of character vectors: [region, age, edu_level, gender, year]
  keys <- do.call(rbind, d$key)

  tibble(
    code      = keys[, 1],
    age       = as.integer(keys[, 2]),
    edu_level = as.integer(keys[, 3]),
    gender    = as.integer(keys[, 4]),
    year      = as.integer(keys[, 5]),
    # Any non-numeric cell becomes NA here and is ignored by the
    # na.rm = TRUE sum in the aggregation step.
    n         = as.integer(unlist(d$values))
  )
}

batches  <- split(munis, ceiling(seq_along(munis) / BATCH_SIZE))
raw_list <- vector("list", length(batches))

for (i in seq_along(batches)) {
  cat("  Fetching batch", i, "/", length(batches),
      "(munis", batches[[i]][1], "–", tail(batches[[i]], 1), ")\n")

  batch_data <- tryCatch(
    fetch_batch(batches[[i]]),
    error = function(e) {
      message("  Batch ", i, " failed: ", conditionMessage(e))
      NULL
    }
  )

  # Assign only on success: `raw_list[[i]] <- NULL` would delete the slot and
  # shift later indices instead of storing a NULL placeholder.
  if (!is.null(batch_data)) {
    raw_list[[i]] <- batch_data
  }

  Sys.sleep(0.3)
}

# Slots still NULL belong to batches that failed.
failed <- vapply(raw_list, is.null, logical(1))

if (all(failed)) {
  stop("No batches were fetched successfully; nothing to write.",
       call. = FALSE)
}

if (any(failed)) {
  missing_munis <- unlist(batches[failed], use.names = FALSE)
  warning(
    sum(failed), " of ", length(batches), " batches failed; output is missing ",
    length(missing_munis), " municipalities: ",
    paste(missing_munis, collapse = ", "),
    call. = FALSE
  )
}

raw <- bind_rows(compact(raw_list))
cat("Total rows fetched:", nrow(raw), "\n")

# ---- Aggregate ---------------------------------------------------------------

# Sum across ages and genders → n per (code, year, edu_level)
attainment <- raw |>
  group_by(code, year, edu_level) |>
  summarise(n = sum(n, na.rm = TRUE), .groups = "drop")

# Summary: % with post-secondary education (levels 5-7) among 25-64 year olds
attainment_summary <- attainment |>
  group_by(code, year) |>
  summarise(
    n_total     = sum(n),
    n_postsec   = sum(n[edu_level >= 5]),
    pct_postsec = 100 * n_postsec / n_total,
    .groups = "drop"
  )

# ---- Write -------------------------------------------------------------------

# Create the output directory up front so a missing folder cannot discard the
# downloaded data at the very last step.
dir.create(OUT_DIR, recursive = TRUE, showWarnings = FALSE)

write_rds(attainment, file.path(OUT_DIR, "attainment_ts.rds"))
write_rds(attainment_summary, file.path(OUT_DIR, "attainment_summary.rds"))

cat("Saved attainment_ts.rds and attainment_summary.rds\n")
cat("Years:", paste(sort(unique(attainment$year)), collapse = ", "), "\n")
cat("Municipalities:", n_distinct(attainment$code), "\n")