diff --git a/src/municipalities/02-CA.R b/src/municipalities/02-CA.R index 30b8e0a..12abfaa 100644 --- a/src/municipalities/02-CA.R +++ b/src/municipalities/02-CA.R @@ -42,7 +42,7 @@ library(FactoMineR) # 00-Load ---------------------------------------------------------------------- m_sample <- read_rds("data/processed/m_sample.rds") -all_vars <- setdiff(names(m_sample), c("code", "municipality")) +all_vars <- setdiff(names(m_sample), c("code", "municipality")) # 01-Exclude genuinely uninformative variables --------------------------------- # Removed from every part of the analysis: redundant totals, geographic areas, @@ -57,7 +57,9 @@ truly_exclude <- c( "land_area_ha", all_vars[str_detect(all_vars, "^type_of_land_")], all_vars[str_detect(all_vars, "^use_of_land_")], - "forest", "open_land", "total_green_space", + "forest", + "open_land", + "total_green_space", all_vars[str_detect(all_vars, "^inland_water")], all_vars[str_detect(all_vars, "^seawater")], all_vars[str_detect(all_vars, "^the_four_large")], @@ -90,23 +92,34 @@ col_sup_politics <- analysis_vars[ # rather than resident persons. Projecting them as supplementary shows how # they relate to the population-composition space without distorting it. col_sup_infra <- analysis_vars[ - str_detect(analysis_vars, - "^number_of_(rented|tenant_owned|owner_occupied)_dwellings") | + str_detect( + analysis_vars, + "^number_of_(rented|tenant_owned|owner_occupied)_dwellings" + ) | str_detect(analysis_vars, "^number_of_registered_passenger_cars_") | str_detect(analysis_vars, "^workplaces_") | str_detect(analysis_vars, "^agricultural_enterprises_") | str_detect(analysis_vars, "^livestock_") | - analysis_vars %in% c( - "sex_men", "sex_women", - "employment_by_gender_men", "employment_by_gender_women", - "number_of_inmigrations", "number_of_outmigrations", - "births", "deaths", "marriages", "divorces", - "buildings", "buildings_for_seasonal_use", - "concentrations_of_holiday_homes", "holiday_home_areas", - "social_assistance_number_of_receiver_households", - "urban_residences_proximity_to_public_green_areas_500_meters_or_less", - "number_of_localities" - ) + analysis_vars %in% + c( + "sex_men", + "sex_women", + "employment_by_gender_men", + "employment_by_gender_women", + "number_of_inmigrations", + "number_of_outmigrations", + "births", + "deaths", + "marriages", + "divorces", + "buildings", + "buildings_for_seasonal_use", + "concentrations_of_holiday_homes", + "holiday_home_areas", + "social_assistance_number_of_receiver_households", + "urban_residences_proximity_to_public_green_areas_500_meters_or_less", + "number_of_localities" + ) ] col_sup_vars <- c(col_sup_edu, col_sup_politics, col_sup_infra) @@ -119,24 +132,16 @@ col_sup_vars <- c(col_sup_edu, col_sup_politics, col_sup_infra) # compositional profiles. active_vars <- analysis_vars[ (str_detect(analysis_vars, "^age_") | - str_detect(analysis_vars, "^education_level_of_swedish_men_") | - str_detect(analysis_vars, "^education_level_of_swedish_women_") | - str_detect(analysis_vars, "^employment_by_activity_sectors_") | - str_detect(analysis_vars, "^birth_country_")) & + str_detect(analysis_vars, "^education_level_of_swedish_men_") | + str_detect(analysis_vars, "^education_level_of_swedish_women_") | + str_detect(analysis_vars, "^employment_by_activity_sectors_") | + str_detect(analysis_vars, "^birth_country_")) & !analysis_vars %in% col_sup_vars ] # Everything else → post-hoc correlations with CA dimensions outside_ca <- setdiff(analysis_vars, c(active_vars, col_sup_vars)) -cat( - "Active (person-count population composition): ", length(active_vars), "\n", - "col.sup – educational provision: ", length(col_sup_edu), "\n", - "col.sup – political vote counts: ", length(col_sup_politics), "\n", - "col.sup – infrastructure / event counts: ", length(col_sup_infra), "\n", - "Outside CA (rates / continuous / other): ", length(outside_ca), "\n" -) - # 03-Build CA matrix ----------------------------------------------------------- X <- m_sample |> select(all_of(c(active_vars, col_sup_vars))) |> @@ -154,9 +159,6 @@ idx_sup <- seq(length(active_vars) + 1L, ncol(X)) # 04-Run CA -------------------------------------------------------------------- ca <- CA(X, ncp = 10, col.sup = idx_sup, graph = FALSE) -cat("\nEigenvalues (first 10 dimensions):\n") -print(round(ca$eig[1:10, ], 3)) - contribs <- ca$col$contrib |> as.data.frame() |> rownames_to_column("variable") @@ -177,7 +179,11 @@ outside_data <- m_sample |> replace_na(x, if (is.finite(m)) m else 0) })) -posthoc_cor <- cor(ca_row_coords, outside_data, use = "pairwise.complete.obs") |> +posthoc_cor <- cor( + ca_row_coords, + outside_data, + use = "pairwise.complete.obs" +) |> as.data.frame() |> rownames_to_column("dimension") @@ -185,11 +191,11 @@ posthoc_cor <- cor(ca_row_coords, outside_data, use = "pairwise.complete.obs") | write_rds(ca, "data/processed/ca_exploratory.rds") write_rds( list( - active = active_vars, - edu = col_sup_edu, + active = active_vars, + edu = col_sup_edu, politics = col_sup_politics, - infra = col_sup_infra, - outside = outside_ca + infra = col_sup_infra, + outside = outside_ca ), "data/processed/ca_var_groups.rds" ) diff --git a/src/municipalities/03-attainment-ts.R b/src/municipalities/03-attainment-ts.R index 7cc1ae6..4350626 100644 --- a/src/municipalities/03-attainment-ts.R +++ b/src/municipalities/03-attainment-ts.R @@ -26,13 +26,13 @@ library(jsonlite) SCB_URL <- "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/UF/UF0506/UF0506B/Utbildning" -YEARS <- c("2000", "2005", "2010", "2015", "2022") +YEARS <- c("2000", "2005", "2010", "2015", "2022") LEVELS <- as.list(as.character(1:7)) -AGES <- as.list(as.character(25:64)) +AGES <- as.list(as.character(25:64)) GENDERS <- list("1", "2") # Fetch municipality codes from table metadata -meta <- fromJSON(content(GET(SCB_URL), "text", encoding = "UTF-8")) +meta <- fromJSON(content(GET(SCB_URL), "text", encoding = "UTF-8")) munis <- Filter(\(x) nchar(x) == 4, meta$variables$values[[1]]) cat("Municipalities to fetch:", length(munis), "\n") @@ -40,23 +40,27 @@ cat("Municipalities to fetch:", length(munis), "\n") fetch_batch <- function(muni_batch) { query <- list( query = list( - list(code = "Region", - selection = list(filter = "item", values = as.list(muni_batch))), - list(code = "Alder", - selection = list(filter = "item", values = AGES)), - list(code = "UtbildningsNiva", - selection = list(filter = "item", values = LEVELS)), - list(code = "Kon", - selection = list(filter = "item", values = GENDERS)), - list(code = "Tid", - selection = list(filter = "item", values = as.list(YEARS))) + list( + code = "Region", + selection = list(filter = "item", values = as.list(muni_batch)) + ), + list(code = "Alder", selection = list(filter = "item", values = AGES)), + list( + code = "UtbildningsNiva", + selection = list(filter = "item", values = LEVELS) + ), + list(code = "Kon", selection = list(filter = "item", values = GENDERS)), + list( + code = "Tid", + selection = list(filter = "item", values = as.list(YEARS)) + ) ), response = list(format = "json") ) resp <- POST( SCB_URL, - body = toJSON(query, auto_unbox = TRUE), - encode = "raw", + body = toJSON(query, auto_unbox = TRUE), + encode = "raw", content_type("application/json"), timeout(60) ) @@ -67,21 +71,30 @@ fetch_batch <- function(muni_batch) { # key is a list of character vectors: [region, age, edu_level, gender, year] keys <- do.call(rbind, d$key) tibble( - code = keys[, 1], - age = as.integer(keys[, 2]), + code = keys[, 1], + age = as.integer(keys[, 2]), edu_level = as.integer(keys[, 3]), - gender = as.integer(keys[, 4]), - year = as.integer(keys[, 5]), - n = as.integer(unlist(d$values)) + gender = as.integer(keys[, 4]), + year = as.integer(keys[, 5]), + n = as.integer(unlist(d$values)) ) } -batches <- split(munis, ceiling(seq_along(munis) / 50)) +batches <- split(munis, ceiling(seq_along(munis) / 50)) raw_list <- vector("list", length(batches)) for (i in seq_along(batches)) { - cat(" Fetching batch", i, "/", length(batches), - "(munis", batches[[i]][1], "–", tail(batches[[i]], 1), ")\n") + cat( + " Fetching batch", + i, + "/", + length(batches), + "(munis", + batches[[i]][1], + "–", + tail(batches[[i]], 1), + ")\n" + ) raw_list[[i]] <- tryCatch( fetch_batch(batches[[i]]), error = function(e) { @@ -93,7 +106,6 @@ for (i in seq_along(batches)) { } raw <- bind_rows(compact(raw_list)) -cat("Total rows fetched:", nrow(raw), "\n") # Aggregate: sum across ages and genders → n per (code, year, edu_level) attainment <- raw |> @@ -104,14 +116,11 @@ attainment <- raw |> attainment_summary <- attainment |> group_by(code, year) |> summarise( - n_total = sum(n), + n_total = sum(n), n_postsec = sum(n[edu_level >= 5]), pct_postsec = 100 * n_postsec / n_total, .groups = "drop" ) -write_rds(attainment, "data/processed/attainment_ts.rds") +write_rds(attainment, "data/processed/attainment_ts.rds") write_rds(attainment_summary, "data/processed/attainment_summary.rds") -cat("Saved attainment_ts.rds and attainment_summary.rds\n") -cat("Years:", paste(sort(unique(attainment$year)), collapse = ", "), "\n") -cat("Municipalities:", n_distinct(attainment$code), "\n")