# =============================================================================
#  03-attainment-ts.R  ·  Educational attainment time-series from SCB PxWeb
# =============================================================================
#
#  Table: UF0506B/Utbildning
#  "Befolkning 16-74 år efter region, utbildningsnivå, ålder och kön. År 1985-2025"
#
#  We fetch ages 25-64 (standard working-age window for education attainment),
#  both genders, all 7 SUN education levels, for selected years.
#  The API imposes an undocumented cell limit (~150k per request), so we
#  batch across municipalities, 50 per request:
#  50 municipalities x 40 ages x 7 levels x 2 genders x 5 years = 140,000 cells.
#
#  Education levels (SUN):
#    1-2 = förgymnasial (pre-secondary)
#    3-4 = gymnasial    (upper secondary)
#    5-6 = eftergymnasial (post-secondary, non-doctoral)
#    7   = forskarutbildning (doctoral)
#
#  Output:
#    data/processed/attainment_ts.rds
#      Columns: code, year, edu_level (1-7), n
#    data/processed/attainment_summary.rds
#      Columns: code, year, n_total, n_postsec, pct_postsec

library(tidyverse)
library(httr)
library(jsonlite)

# PxWeb endpoint for table UF0506B/Utbildning; the same URL serves the table
# metadata (GET) and the data queries (POST).
SCB_URL <- "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/UF/UF0506/UF0506B/Utbildning"

# Query selections. Everything is sent to the API as character codes; the
# list-valued selections are lists so each one serialises as a JSON array.
YEARS <- c(
  "2000",
  "2005",
  "2010",
  "2015",
  "2022"
)
LEVELS  <- lapply(1:7, as.character)    # education level codes "1".."7"
AGES    <- lapply(25:64, as.character)  # single-year ages "25".."64"
GENDERS <- as.list(c("1", "2"))         # both gender codes

# Fetch municipality codes from table metadata.
# A GET on the table URL returns its variable definitions. Fail loudly on a
# non-200 response instead of handing an error page to the JSON parser.
meta_resp <- GET(SCB_URL, timeout(60))
if (status_code(meta_resp) != 200) {
  stop("HTTP ", status_code(meta_resp), " fetching table metadata from ",
       SCB_URL, call. = FALSE)
}
meta <- fromJSON(content(meta_resp, "text", encoding = "UTF-8"))

# Look the region variable up by its code (the same code the data query
# uses) rather than assuming it is the first variable in the metadata.
region_idx <- match("Region", meta$variables$code)
if (is.na(region_idx)) {
  stop("Variable 'Region' not found in table metadata", call. = FALSE)
}
region_values <- meta$variables$values[[region_idx]]

# Keep only the 4-character region codes; these are treated as the
# municipality codes, and codes of other lengths are dropped.
munis <- region_values[nchar(region_values) == 4]
if (length(munis) == 0) {
  stop("No 4-character region codes found in table metadata", call. = FALSE)
}
cat("Municipalities to fetch:", length(munis), "\n")

# Batch fetch: 50 municipalities per request.
#
# Posts one PxWeb query for the given municipalities (all AGES, LEVELS,
# GENDERS and YEARS) and returns the result as a long tibble.
#
# Args:
#   muni_batch: character vector of municipality codes for this request.
#
# Returns:
#   A tibble with one row per returned cell and columns `code` (character),
#   `age`, `edu_level`, `gender`, `year` and `n` (all integer). Cell values
#   that are not numeric become NA (with R's usual coercion warning).
#
# Raises:
#   An error if the HTTP status is not 200, if the response has no data rows,
#   or if the key layout cannot be interpreted.
fetch_batch <- function(muni_batch) {
  query <- list(
    query = list(
      list(code = "Region",
           selection = list(filter = "item", values = as.list(muni_batch))),
      list(code = "Alder",
           selection = list(filter = "item", values = AGES)),
      list(code = "UtbildningsNiva",
           selection = list(filter = "item", values = LEVELS)),
      list(code = "Kon",
           selection = list(filter = "item", values = GENDERS)),
      list(code = "Tid",
           selection = list(filter = "item", values = as.list(YEARS)))
    ),
    response = list(format = "json")
  )
  resp <- POST(
    SCB_URL,
    body    = toJSON(query, auto_unbox = TRUE),
    encode  = "raw",
    content_type("application/json"),
    timeout(60)
  )
  if (status_code(resp) != 200) {
    stop("HTTP ", status_code(resp), " for batch starting at ", muni_batch[1],
         call. = FALSE)
  }

  parsed <- fromJSON(content(resp, "text", encoding = "UTF-8"))
  d <- parsed$data
  if (length(d) == 0 || NROW(d) == 0) {
    stop("Empty data in response for batch starting at ", muni_batch[1],
         call. = FALSE)
  }

  # Each row's key is a character vector of dimension codes; stack them into
  # a character matrix with one column per dimension.
  keys <- do.call(rbind, d$key)

  # Name the key columns from the response's own `columns` metadata instead
  # of trusting a fixed position: the key order is decided by the server and
  # need not match the order in which the query lists the variables.
  # NOTE(review): assumes value columns are flagged with type "c" in
  # `columns`, as in the PxWeb JSON format -- confirm against a live response.
  wanted <- c("Region", "Alder", "UtbildningsNiva", "Kon", "Tid")
  reported <- character(0)
  cols <- parsed$columns
  if (is.data.frame(cols) && !is.null(cols$code)) {
    col_type <- if (is.null(cols$type)) {
      rep(NA_character_, nrow(cols))
    } else {
      cols$type
    }
    reported <- cols$code[is.na(col_type) | col_type != "c"]
  }

  if (length(reported) == ncol(keys) && all(wanted %in% reported)) {
    colnames(keys) <- reported
  } else if (ncol(keys) == length(wanted)) {
    # Metadata unusable: keep the historical positional interpretation.
    warning("Could not match key columns by code for batch starting at ",
            muni_batch[1], "; assuming order ",
            paste(wanted, collapse = ", "), call. = FALSE)
    colnames(keys) <- wanted
  } else {
    stop("Unexpected key layout (", ncol(keys),
         " key columns) for batch starting at ", muni_batch[1],
         call. = FALSE)
  }

  # unname() keeps a one-row response from carrying the column name along
  # as a vector name.
  tibble(
    code      = unname(keys[, "Region"]),
    age       = as.integer(keys[, "Alder"]),
    edu_level = as.integer(keys[, "UtbildningsNiva"]),
    gender    = as.integer(keys[, "Kon"]),
    year      = as.integer(keys[, "Tid"]),
    n         = as.integer(unlist(d$values))
  )
}

# Split the municipality codes into request-sized batches of 50.
batches   <- split(munis, ceiling(seq_along(munis) / 50))
raw_list  <- vector("list", length(batches))
succeeded <- logical(length(batches))

for (i in seq_along(batches)) {
  cat("  Fetching batch", i, "/", length(batches),
      "(munis", batches[[i]][1], "–", tail(batches[[i]], 1), ")\n")
  result <- tryCatch(
    fetch_batch(batches[[i]]),
    error = function(e) {
      message("  Batch ", i, " failed: ", conditionMessage(e))
      NULL
    }
  )
  # Only store successful results: assigning NULL with `[[<-` would delete
  # the list slot and shift later positions instead of leaving a placeholder.
  if (!is.null(result)) {
    raw_list[[i]] <- result
    succeeded[i]  <- TRUE
  }
  Sys.sleep(0.3)  # brief pause between requests
}

# A failed batch is tolerated (best effort), but the saved output is then
# incomplete, so say so explicitly once the loop is done.
failed <- which(!succeeded)
if (length(failed) > 0) {
  warning(length(failed), " of ", length(batches), " batches failed (batch ",
          paste(failed, collapse = ", "), "); output is missing up to ",
          sum(lengths(batches[failed])), " municipalities",
          call. = FALSE)
}

raw <- bind_rows(compact(raw_list))
if (nrow(raw) == 0) {
  stop("No data fetched: every batch failed", call. = FALSE)
}
cat("Total rows fetched:", nrow(raw), "\n")

# Collapse ages and genders: one row per (code, year, edu_level), where n is
# the summed count with missing cells ignored.
by_level   <- group_by(raw, code, year, edu_level)
attainment <- summarise(by_level, n = sum(n, na.rm = TRUE), .groups = "drop")

# Per (code, year): total count, count at levels 5 and above (post-secondary),
# and the post-secondary share in percent.
attainment_summary <- attainment |>
  group_by(code, year) |>
  summarise(
    n_total   = sum(n),
    n_postsec = sum(n[edu_level >= 5]),
    .groups   = "drop"
  ) |>
  mutate(pct_postsec = 100 * n_postsec / n_total)

# Persist both tables. Create the output directory first so a fresh checkout
# does not fail on the write; this is a no-op when it already exists.
dir.create("data/processed", recursive = TRUE, showWarnings = FALSE)
write_rds(attainment,         "data/processed/attainment_ts.rds")
write_rds(attainment_summary, "data/processed/attainment_summary.rds")
cat("Saved attainment_ts.rds and attainment_summary.rds\n")

# Quick sanity report of what was actually saved.
cat("Years:", paste(sort(unique(attainment$year)), collapse = ", "), "\n")
cat("Municipalities:", n_distinct(attainment$code), "\n")