118 lines
4.1 KiB
R
118 lines
4.1 KiB
R
|
|
# =============================================================================
|
|||
|
|
# 03-attainment-ts.R · Educational attainment time-series from SCB PxWeb
|
|||
|
|
# =============================================================================
|
|||
|
|
#
|
|||
|
|
# Table: UF0506B/Utbildning
|
|||
|
|
# "Befolkning 16-74 år efter region, utbildningsnivå, ålder och kön. År 1985-2025"
|
|||
|
|
#
|
|||
|
|
# We fetch ages 25-64 (standard working-age window for education attainment),
|
|||
|
|
# both genders, all 7 SUN education levels, for selected years.
|
|||
|
|
# The API imposes an undocumented cell limit (~150k per request), so we
|
|||
|
|
# batch across municipalities (50 per request).
|
|||
|
|
#
|
|||
|
|
# Education levels (SUN):
|
|||
|
|
# 1-2 = förgymnasial (pre-secondary)
|
|||
|
|
# 3-4 = gymnasial (upper secondary)
|
|||
|
|
# 5-6 = eftergymnasial (post-secondary, non-doctoral)
|
|||
|
|
# 7 = forskarutbildning (doctoral)
|
|||
|
|
#
|
|||
|
|
# Output: data/processed/attainment_ts.rds
|
|||
|
|
# Columns: code, year, edu_level (1-7), n
|
|||
|
|
# Also a summary: code, year, pct_postsec, n_25_64
|
|||
|
|
|
|||
|
|
library(tidyverse)
|
|||
|
|
library(httr)
|
|||
|
|
library(jsonlite)
|
|||
|
|
|
|||
|
|
SCB_URL <- "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/UF/UF0506/UF0506B/Utbildning"
|
|||
|
|
|
|||
|
|
YEARS <- c("2000", "2005", "2010", "2015", "2022")
|
|||
|
|
LEVELS <- as.list(as.character(1:7))
|
|||
|
|
AGES <- as.list(as.character(25:64))
|
|||
|
|
GENDERS <- list("1", "2")
|
|||
|
|
|
|||
|
|
# Fetch municipality codes from table metadata
|
|||
|
|
meta <- fromJSON(content(GET(SCB_URL), "text", encoding = "UTF-8"))
|
|||
|
|
munis <- Filter(\(x) nchar(x) == 4, meta$variables$values[[1]])
|
|||
|
|
cat("Municipalities to fetch:", length(munis), "\n")
|
|||
|
|
|
|||
|
|
# Batch fetch: 50 municipalities per request
|
|||
|
|
fetch_batch <- function(muni_batch) {
|
|||
|
|
query <- list(
|
|||
|
|
query = list(
|
|||
|
|
list(code = "Region",
|
|||
|
|
selection = list(filter = "item", values = as.list(muni_batch))),
|
|||
|
|
list(code = "Alder",
|
|||
|
|
selection = list(filter = "item", values = AGES)),
|
|||
|
|
list(code = "UtbildningsNiva",
|
|||
|
|
selection = list(filter = "item", values = LEVELS)),
|
|||
|
|
list(code = "Kon",
|
|||
|
|
selection = list(filter = "item", values = GENDERS)),
|
|||
|
|
list(code = "Tid",
|
|||
|
|
selection = list(filter = "item", values = as.list(YEARS)))
|
|||
|
|
),
|
|||
|
|
response = list(format = "json")
|
|||
|
|
)
|
|||
|
|
resp <- POST(
|
|||
|
|
SCB_URL,
|
|||
|
|
body = toJSON(query, auto_unbox = TRUE),
|
|||
|
|
encode = "raw",
|
|||
|
|
content_type("application/json"),
|
|||
|
|
timeout(60)
|
|||
|
|
)
|
|||
|
|
if (status_code(resp) != 200) {
|
|||
|
|
stop("HTTP ", status_code(resp), " for batch starting at ", muni_batch[1])
|
|||
|
|
}
|
|||
|
|
d <- fromJSON(content(resp, "text", encoding = "UTF-8"))$data
|
|||
|
|
# key is a list of character vectors: [region, age, edu_level, gender, year]
|
|||
|
|
keys <- do.call(rbind, d$key)
|
|||
|
|
tibble(
|
|||
|
|
code = keys[, 1],
|
|||
|
|
age = as.integer(keys[, 2]),
|
|||
|
|
edu_level = as.integer(keys[, 3]),
|
|||
|
|
gender = as.integer(keys[, 4]),
|
|||
|
|
year = as.integer(keys[, 5]),
|
|||
|
|
n = as.integer(unlist(d$values))
|
|||
|
|
)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
batches <- split(munis, ceiling(seq_along(munis) / 50))
|
|||
|
|
raw_list <- vector("list", length(batches))
|
|||
|
|
|
|||
|
|
for (i in seq_along(batches)) {
|
|||
|
|
cat(" Fetching batch", i, "/", length(batches),
|
|||
|
|
"(munis", batches[[i]][1], "–", tail(batches[[i]], 1), ")\n")
|
|||
|
|
raw_list[[i]] <- tryCatch(
|
|||
|
|
fetch_batch(batches[[i]]),
|
|||
|
|
error = function(e) {
|
|||
|
|
message(" Batch ", i, " failed: ", conditionMessage(e))
|
|||
|
|
NULL
|
|||
|
|
}
|
|||
|
|
)
|
|||
|
|
Sys.sleep(0.3)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
raw <- bind_rows(compact(raw_list))
|
|||
|
|
cat("Total rows fetched:", nrow(raw), "\n")
|
|||
|
|
|
|||
|
|
# Aggregate: sum across ages and genders → n per (code, year, edu_level)
|
|||
|
|
attainment <- raw |>
|
|||
|
|
group_by(code, year, edu_level) |>
|
|||
|
|
summarise(n = sum(n, na.rm = TRUE), .groups = "drop")
|
|||
|
|
|
|||
|
|
# Summary: % with post-secondary education among 25-64 year olds
|
|||
|
|
attainment_summary <- attainment |>
|
|||
|
|
group_by(code, year) |>
|
|||
|
|
summarise(
|
|||
|
|
n_total = sum(n),
|
|||
|
|
n_postsec = sum(n[edu_level >= 5]),
|
|||
|
|
pct_postsec = 100 * n_postsec / n_total,
|
|||
|
|
.groups = "drop"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
write_rds(attainment, "data/processed/attainment_ts.rds")
|
|||
|
|
write_rds(attainment_summary, "data/processed/attainment_summary.rds")
|
|||
|
|
cat("Saved attainment_ts.rds and attainment_summary.rds\n")
|
|||
|
|
cat("Years:", paste(sort(unique(attainment$year)), collapse = ", "), "\n")
|
|||
|
|
cat("Municipalities:", n_distinct(attainment$code), "\n")
|