clean up

2026-05-08 10:12:33 +02:00 · 2026-05-08 10:12:33 +02:00 · eb093605b3
commit eb093605b3
parent 1631adfb02
2 changed files with 79 additions and 64 deletions
--- a/src/municipalities/02-CA.R
+++ b/src/municipalities/02-CA.R
@ -42,7 +42,7 @@ library(FactoMineR)

 # 00-Load ----------------------------------------------------------------------
 m_sample <- read_rds("data/processed/m_sample.rds")
-all_vars  <- setdiff(names(m_sample), c("code", "municipality"))
+all_vars <- setdiff(names(m_sample), c("code", "municipality"))

 # 01-Exclude genuinely uninformative variables ---------------------------------
 # Removed from every part of the analysis: redundant totals, geographic areas,
@ -57,7 +57,9 @@ truly_exclude <- c(
  "land_area_ha",
  all_vars[str_detect(all_vars, "^type_of_land_")],
  all_vars[str_detect(all_vars, "^use_of_land_")],
-  "forest", "open_land", "total_green_space",
+  "forest",
+  "open_land",
+  "total_green_space",
  all_vars[str_detect(all_vars, "^inland_water")],
  all_vars[str_detect(all_vars, "^seawater")],
  all_vars[str_detect(all_vars, "^the_four_large")],
@ -90,23 +92,34 @@ col_sup_politics <- analysis_vars[
 # rather than resident persons. Projecting them as supplementary shows how
 # they relate to the population-composition space without distorting it.
 col_sup_infra <- analysis_vars[
-  str_detect(analysis_vars,
-    "^number_of_(rented|tenant_owned|owner_occupied)_dwellings") |
+  str_detect(
+    analysis_vars,
+    "^number_of_(rented|tenant_owned|owner_occupied)_dwellings"
+  ) |
    str_detect(analysis_vars, "^number_of_registered_passenger_cars_") |
    str_detect(analysis_vars, "^workplaces_") |
    str_detect(analysis_vars, "^agricultural_enterprises_") |
    str_detect(analysis_vars, "^livestock_") |
-    analysis_vars %in% c(
-      "sex_men", "sex_women",
-      "employment_by_gender_men", "employment_by_gender_women",
-      "number_of_inmigrations", "number_of_outmigrations",
-      "births", "deaths", "marriages", "divorces",
-      "buildings", "buildings_for_seasonal_use",
-      "concentrations_of_holiday_homes", "holiday_home_areas",
-      "social_assistance_number_of_receiver_households",
-      "urban_residences_proximity_to_public_green_areas_500_meters_or_less",
-      "number_of_localities"
-    )
+    analysis_vars %in%
+      c(
+        "sex_men",
+        "sex_women",
+        "employment_by_gender_men",
+        "employment_by_gender_women",
+        "number_of_inmigrations",
+        "number_of_outmigrations",
+        "births",
+        "deaths",
+        "marriages",
+        "divorces",
+        "buildings",
+        "buildings_for_seasonal_use",
+        "concentrations_of_holiday_homes",
+        "holiday_home_areas",
+        "social_assistance_number_of_receiver_households",
+        "urban_residences_proximity_to_public_green_areas_500_meters_or_less",
+        "number_of_localities"
+      )
 ]

 col_sup_vars <- c(col_sup_edu, col_sup_politics, col_sup_infra)
@ -119,24 +132,16 @@ col_sup_vars <- c(col_sup_edu, col_sup_politics, col_sup_infra)
 # compositional profiles.
 active_vars <- analysis_vars[
  (str_detect(analysis_vars, "^age_") |
-     str_detect(analysis_vars, "^education_level_of_swedish_men_") |
-     str_detect(analysis_vars, "^education_level_of_swedish_women_") |
-     str_detect(analysis_vars, "^employment_by_activity_sectors_") |
-     str_detect(analysis_vars, "^birth_country_")) &
+    str_detect(analysis_vars, "^education_level_of_swedish_men_") |
+    str_detect(analysis_vars, "^education_level_of_swedish_women_") |
+    str_detect(analysis_vars, "^employment_by_activity_sectors_") |
+    str_detect(analysis_vars, "^birth_country_")) &
    !analysis_vars %in% col_sup_vars
 ]

 # Everything else → post-hoc correlations with CA dimensions
 outside_ca <- setdiff(analysis_vars, c(active_vars, col_sup_vars))

-cat(
-  "Active (person-count population composition): ", length(active_vars), "\n",
-  "col.sup – educational provision:             ", length(col_sup_edu), "\n",
-  "col.sup – political vote counts:             ", length(col_sup_politics), "\n",
-  "col.sup – infrastructure / event counts:     ", length(col_sup_infra), "\n",
-  "Outside CA (rates / continuous / other):     ", length(outside_ca), "\n"
-)
-
 # 03-Build CA matrix -----------------------------------------------------------
 X <- m_sample |>
  select(all_of(c(active_vars, col_sup_vars))) |>
@ -154,9 +159,6 @@ idx_sup <- seq(length(active_vars) + 1L, ncol(X))
 # 04-Run CA --------------------------------------------------------------------
 ca <- CA(X, ncp = 10, col.sup = idx_sup, graph = FALSE)

-cat("\nEigenvalues (first 10 dimensions):\n")
-print(round(ca$eig[1:10, ], 3))
-
 contribs <- ca$col$contrib |>
  as.data.frame() |>
  rownames_to_column("variable")
@ -177,7 +179,11 @@ outside_data <- m_sample |>
    replace_na(x, if (is.finite(m)) m else 0)
  }))

-posthoc_cor <- cor(ca_row_coords, outside_data, use = "pairwise.complete.obs") |>
+posthoc_cor <- cor(
+  ca_row_coords,
+  outside_data,
+  use = "pairwise.complete.obs"
+) |>
  as.data.frame() |>
  rownames_to_column("dimension")

@ -185,11 +191,11 @@ posthoc_cor <- cor(ca_row_coords, outside_data, use = "pairwise.complete.obs") |
 write_rds(ca, "data/processed/ca_exploratory.rds")
 write_rds(
  list(
-    active   = active_vars,
-    edu      = col_sup_edu,
+    active = active_vars,
+    edu = col_sup_edu,
    politics = col_sup_politics,
-    infra    = col_sup_infra,
-    outside  = outside_ca
+    infra = col_sup_infra,
+    outside = outside_ca
  ),
  "data/processed/ca_var_groups.rds"
 )
--- a/src/municipalities/03-attainment-ts.R
+++ b/src/municipalities/03-attainment-ts.R
@ -26,13 +26,13 @@ library(jsonlite)

 SCB_URL <- "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/UF/UF0506/UF0506B/Utbildning"

-YEARS  <- c("2000", "2005", "2010", "2015", "2022")
+YEARS <- c("2000", "2005", "2010", "2015", "2022")
 LEVELS <- as.list(as.character(1:7))
-AGES   <- as.list(as.character(25:64))
+AGES <- as.list(as.character(25:64))
 GENDERS <- list("1", "2")

 # Fetch municipality codes from table metadata
-meta  <- fromJSON(content(GET(SCB_URL), "text", encoding = "UTF-8"))
+meta <- fromJSON(content(GET(SCB_URL), "text", encoding = "UTF-8"))
 munis <- Filter(\(x) nchar(x) == 4, meta$variables$values[[1]])
 cat("Municipalities to fetch:", length(munis), "\n")

@ -40,23 +40,27 @@ cat("Municipalities to fetch:", length(munis), "\n")
 fetch_batch <- function(muni_batch) {
  query <- list(
    query = list(
-      list(code = "Region",
-           selection = list(filter = "item", values = as.list(muni_batch))),
-      list(code = "Alder",
-           selection = list(filter = "item", values = AGES)),
-      list(code = "UtbildningsNiva",
-           selection = list(filter = "item", values = LEVELS)),
-      list(code = "Kon",
-           selection = list(filter = "item", values = GENDERS)),
-      list(code = "Tid",
-           selection = list(filter = "item", values = as.list(YEARS)))
+      list(
+        code = "Region",
+        selection = list(filter = "item", values = as.list(muni_batch))
+      ),
+      list(code = "Alder", selection = list(filter = "item", values = AGES)),
+      list(
+        code = "UtbildningsNiva",
+        selection = list(filter = "item", values = LEVELS)
+      ),
+      list(code = "Kon", selection = list(filter = "item", values = GENDERS)),
+      list(
+        code = "Tid",
+        selection = list(filter = "item", values = as.list(YEARS))
+      )
    ),
    response = list(format = "json")
  )
  resp <- POST(
    SCB_URL,
-    body    = toJSON(query, auto_unbox = TRUE),
-    encode  = "raw",
+    body = toJSON(query, auto_unbox = TRUE),
+    encode = "raw",
    content_type("application/json"),
    timeout(60)
  )
@ -67,21 +71,30 @@ fetch_batch <- function(muni_batch) {
  # key is a list of character vectors: [region, age, edu_level, gender, year]
  keys <- do.call(rbind, d$key)
  tibble(
-    code      = keys[, 1],
-    age       = as.integer(keys[, 2]),
+    code = keys[, 1],
+    age = as.integer(keys[, 2]),
    edu_level = as.integer(keys[, 3]),
-    gender    = as.integer(keys[, 4]),
-    year      = as.integer(keys[, 5]),
-    n         = as.integer(unlist(d$values))
+    gender = as.integer(keys[, 4]),
+    year = as.integer(keys[, 5]),
+    n = as.integer(unlist(d$values))
  )
 }

-batches  <- split(munis, ceiling(seq_along(munis) / 50))
+batches <- split(munis, ceiling(seq_along(munis) / 50))
 raw_list <- vector("list", length(batches))

 for (i in seq_along(batches)) {
-  cat("  Fetching batch", i, "/", length(batches),
-      "(munis", batches[[i]][1], "–", tail(batches[[i]], 1), ")\n")
+  cat(
+    "  Fetching batch",
+    i,
+    "/",
+    length(batches),
+    "(munis",
+    batches[[i]][1],
+    "–",
+    tail(batches[[i]], 1),
+    ")\n"
+  )
  raw_list[[i]] <- tryCatch(
    fetch_batch(batches[[i]]),
    error = function(e) {
@ -93,7 +106,6 @@ for (i in seq_along(batches)) {
 }

 raw <- bind_rows(compact(raw_list))
-cat("Total rows fetched:", nrow(raw), "\n")

 # Aggregate: sum across ages and genders → n per (code, year, edu_level)
 attainment <- raw |>
@ -104,14 +116,11 @@ attainment <- raw |>
 attainment_summary <- attainment |>
  group_by(code, year) |>
  summarise(
-    n_total   = sum(n),
+    n_total = sum(n),
    n_postsec = sum(n[edu_level >= 5]),
    pct_postsec = 100 * n_postsec / n_total,
    .groups = "drop"
  )

-write_rds(attainment,         "data/processed/attainment_ts.rds")
+write_rds(attainment, "data/processed/attainment_ts.rds")
 write_rds(attainment_summary, "data/processed/attainment_summary.rds")
-cat("Saved attainment_ts.rds and attainment_summary.rds\n")
-cat("Years:", paste(sort(unique(attainment$year)), collapse = ", "), "\n")
-cat("Municipalities:", n_distinct(attainment$code), "\n")