clean up
This commit is contained in:
parent
1631adfb02
commit
eb093605b3
2 changed files with 79 additions and 64 deletions
|
|
@ -42,7 +42,7 @@ library(FactoMineR)
|
|||
|
||||
# 00-Load ----------------------------------------------------------------------
|
||||
m_sample <- read_rds("data/processed/m_sample.rds")
|
||||
all_vars <- setdiff(names(m_sample), c("code", "municipality"))
|
||||
all_vars <- setdiff(names(m_sample), c("code", "municipality"))
|
||||
|
||||
# 01-Exclude genuinely uninformative variables ---------------------------------
|
||||
# Removed from every part of the analysis: redundant totals, geographic areas,
|
||||
|
|
@ -57,7 +57,9 @@ truly_exclude <- c(
|
|||
"land_area_ha",
|
||||
all_vars[str_detect(all_vars, "^type_of_land_")],
|
||||
all_vars[str_detect(all_vars, "^use_of_land_")],
|
||||
"forest", "open_land", "total_green_space",
|
||||
"forest",
|
||||
"open_land",
|
||||
"total_green_space",
|
||||
all_vars[str_detect(all_vars, "^inland_water")],
|
||||
all_vars[str_detect(all_vars, "^seawater")],
|
||||
all_vars[str_detect(all_vars, "^the_four_large")],
|
||||
|
|
@ -90,23 +92,34 @@ col_sup_politics <- analysis_vars[
|
|||
# rather than resident persons. Projecting them as supplementary shows how
|
||||
# they relate to the population-composition space without distorting it.
|
||||
col_sup_infra <- analysis_vars[
|
||||
str_detect(analysis_vars,
|
||||
"^number_of_(rented|tenant_owned|owner_occupied)_dwellings") |
|
||||
str_detect(
|
||||
analysis_vars,
|
||||
"^number_of_(rented|tenant_owned|owner_occupied)_dwellings"
|
||||
) |
|
||||
str_detect(analysis_vars, "^number_of_registered_passenger_cars_") |
|
||||
str_detect(analysis_vars, "^workplaces_") |
|
||||
str_detect(analysis_vars, "^agricultural_enterprises_") |
|
||||
str_detect(analysis_vars, "^livestock_") |
|
||||
analysis_vars %in% c(
|
||||
"sex_men", "sex_women",
|
||||
"employment_by_gender_men", "employment_by_gender_women",
|
||||
"number_of_inmigrations", "number_of_outmigrations",
|
||||
"births", "deaths", "marriages", "divorces",
|
||||
"buildings", "buildings_for_seasonal_use",
|
||||
"concentrations_of_holiday_homes", "holiday_home_areas",
|
||||
"social_assistance_number_of_receiver_households",
|
||||
"urban_residences_proximity_to_public_green_areas_500_meters_or_less",
|
||||
"number_of_localities"
|
||||
)
|
||||
analysis_vars %in%
|
||||
c(
|
||||
"sex_men",
|
||||
"sex_women",
|
||||
"employment_by_gender_men",
|
||||
"employment_by_gender_women",
|
||||
"number_of_inmigrations",
|
||||
"number_of_outmigrations",
|
||||
"births",
|
||||
"deaths",
|
||||
"marriages",
|
||||
"divorces",
|
||||
"buildings",
|
||||
"buildings_for_seasonal_use",
|
||||
"concentrations_of_holiday_homes",
|
||||
"holiday_home_areas",
|
||||
"social_assistance_number_of_receiver_households",
|
||||
"urban_residences_proximity_to_public_green_areas_500_meters_or_less",
|
||||
"number_of_localities"
|
||||
)
|
||||
]
|
||||
|
||||
col_sup_vars <- c(col_sup_edu, col_sup_politics, col_sup_infra)
|
||||
|
|
@ -119,24 +132,16 @@ col_sup_vars <- c(col_sup_edu, col_sup_politics, col_sup_infra)
|
|||
# compositional profiles.
|
||||
active_vars <- analysis_vars[
|
||||
(str_detect(analysis_vars, "^age_") |
|
||||
str_detect(analysis_vars, "^education_level_of_swedish_men_") |
|
||||
str_detect(analysis_vars, "^education_level_of_swedish_women_") |
|
||||
str_detect(analysis_vars, "^employment_by_activity_sectors_") |
|
||||
str_detect(analysis_vars, "^birth_country_")) &
|
||||
str_detect(analysis_vars, "^education_level_of_swedish_men_") |
|
||||
str_detect(analysis_vars, "^education_level_of_swedish_women_") |
|
||||
str_detect(analysis_vars, "^employment_by_activity_sectors_") |
|
||||
str_detect(analysis_vars, "^birth_country_")) &
|
||||
!analysis_vars %in% col_sup_vars
|
||||
]
|
||||
|
||||
# Everything else → post-hoc correlations with CA dimensions
|
||||
outside_ca <- setdiff(analysis_vars, c(active_vars, col_sup_vars))
|
||||
|
||||
cat(
|
||||
"Active (person-count population composition): ", length(active_vars), "\n",
|
||||
"col.sup – educational provision: ", length(col_sup_edu), "\n",
|
||||
"col.sup – political vote counts: ", length(col_sup_politics), "\n",
|
||||
"col.sup – infrastructure / event counts: ", length(col_sup_infra), "\n",
|
||||
"Outside CA (rates / continuous / other): ", length(outside_ca), "\n"
|
||||
)
|
||||
|
||||
# 03-Build CA matrix -----------------------------------------------------------
|
||||
X <- m_sample |>
|
||||
select(all_of(c(active_vars, col_sup_vars))) |>
|
||||
|
|
@ -154,9 +159,6 @@ idx_sup <- seq(length(active_vars) + 1L, ncol(X))
|
|||
# 04-Run CA --------------------------------------------------------------------
|
||||
ca <- CA(X, ncp = 10, col.sup = idx_sup, graph = FALSE)
|
||||
|
||||
cat("\nEigenvalues (first 10 dimensions):\n")
|
||||
print(round(ca$eig[1:10, ], 3))
|
||||
|
||||
contribs <- ca$col$contrib |>
|
||||
as.data.frame() |>
|
||||
rownames_to_column("variable")
|
||||
|
|
@ -177,7 +179,11 @@ outside_data <- m_sample |>
|
|||
replace_na(x, if (is.finite(m)) m else 0)
|
||||
}))
|
||||
|
||||
posthoc_cor <- cor(ca_row_coords, outside_data, use = "pairwise.complete.obs") |>
|
||||
posthoc_cor <- cor(
|
||||
ca_row_coords,
|
||||
outside_data,
|
||||
use = "pairwise.complete.obs"
|
||||
) |>
|
||||
as.data.frame() |>
|
||||
rownames_to_column("dimension")
|
||||
|
||||
|
|
@ -185,11 +191,11 @@ posthoc_cor <- cor(ca_row_coords, outside_data, use = "pairwise.complete.obs") |
|
|||
write_rds(ca, "data/processed/ca_exploratory.rds")
|
||||
write_rds(
|
||||
list(
|
||||
active = active_vars,
|
||||
edu = col_sup_edu,
|
||||
active = active_vars,
|
||||
edu = col_sup_edu,
|
||||
politics = col_sup_politics,
|
||||
infra = col_sup_infra,
|
||||
outside = outside_ca
|
||||
infra = col_sup_infra,
|
||||
outside = outside_ca
|
||||
),
|
||||
"data/processed/ca_var_groups.rds"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -26,13 +26,13 @@ library(jsonlite)
|
|||
|
||||
SCB_URL <- "https://api.scb.se/OV0104/v1/doris/sv/ssd/START/UF/UF0506/UF0506B/Utbildning"
|
||||
|
||||
YEARS <- c("2000", "2005", "2010", "2015", "2022")
|
||||
YEARS <- c("2000", "2005", "2010", "2015", "2022")
|
||||
LEVELS <- as.list(as.character(1:7))
|
||||
AGES <- as.list(as.character(25:64))
|
||||
AGES <- as.list(as.character(25:64))
|
||||
GENDERS <- list("1", "2")
|
||||
|
||||
# Fetch municipality codes from table metadata
|
||||
meta <- fromJSON(content(GET(SCB_URL), "text", encoding = "UTF-8"))
|
||||
meta <- fromJSON(content(GET(SCB_URL), "text", encoding = "UTF-8"))
|
||||
munis <- Filter(\(x) nchar(x) == 4, meta$variables$values[[1]])
|
||||
cat("Municipalities to fetch:", length(munis), "\n")
|
||||
|
||||
|
|
@ -40,23 +40,27 @@ cat("Municipalities to fetch:", length(munis), "\n")
|
|||
fetch_batch <- function(muni_batch) {
|
||||
query <- list(
|
||||
query = list(
|
||||
list(code = "Region",
|
||||
selection = list(filter = "item", values = as.list(muni_batch))),
|
||||
list(code = "Alder",
|
||||
selection = list(filter = "item", values = AGES)),
|
||||
list(code = "UtbildningsNiva",
|
||||
selection = list(filter = "item", values = LEVELS)),
|
||||
list(code = "Kon",
|
||||
selection = list(filter = "item", values = GENDERS)),
|
||||
list(code = "Tid",
|
||||
selection = list(filter = "item", values = as.list(YEARS)))
|
||||
list(
|
||||
code = "Region",
|
||||
selection = list(filter = "item", values = as.list(muni_batch))
|
||||
),
|
||||
list(code = "Alder", selection = list(filter = "item", values = AGES)),
|
||||
list(
|
||||
code = "UtbildningsNiva",
|
||||
selection = list(filter = "item", values = LEVELS)
|
||||
),
|
||||
list(code = "Kon", selection = list(filter = "item", values = GENDERS)),
|
||||
list(
|
||||
code = "Tid",
|
||||
selection = list(filter = "item", values = as.list(YEARS))
|
||||
)
|
||||
),
|
||||
response = list(format = "json")
|
||||
)
|
||||
resp <- POST(
|
||||
SCB_URL,
|
||||
body = toJSON(query, auto_unbox = TRUE),
|
||||
encode = "raw",
|
||||
body = toJSON(query, auto_unbox = TRUE),
|
||||
encode = "raw",
|
||||
content_type("application/json"),
|
||||
timeout(60)
|
||||
)
|
||||
|
|
@ -67,21 +71,30 @@ fetch_batch <- function(muni_batch) {
|
|||
# key is a list of character vectors: [region, age, edu_level, gender, year]
|
||||
keys <- do.call(rbind, d$key)
|
||||
tibble(
|
||||
code = keys[, 1],
|
||||
age = as.integer(keys[, 2]),
|
||||
code = keys[, 1],
|
||||
age = as.integer(keys[, 2]),
|
||||
edu_level = as.integer(keys[, 3]),
|
||||
gender = as.integer(keys[, 4]),
|
||||
year = as.integer(keys[, 5]),
|
||||
n = as.integer(unlist(d$values))
|
||||
gender = as.integer(keys[, 4]),
|
||||
year = as.integer(keys[, 5]),
|
||||
n = as.integer(unlist(d$values))
|
||||
)
|
||||
}
|
||||
|
||||
batches <- split(munis, ceiling(seq_along(munis) / 50))
|
||||
batches <- split(munis, ceiling(seq_along(munis) / 50))
|
||||
raw_list <- vector("list", length(batches))
|
||||
|
||||
for (i in seq_along(batches)) {
|
||||
cat(" Fetching batch", i, "/", length(batches),
|
||||
"(munis", batches[[i]][1], "–", tail(batches[[i]], 1), ")\n")
|
||||
cat(
|
||||
" Fetching batch",
|
||||
i,
|
||||
"/",
|
||||
length(batches),
|
||||
"(munis",
|
||||
batches[[i]][1],
|
||||
"–",
|
||||
tail(batches[[i]], 1),
|
||||
")\n"
|
||||
)
|
||||
raw_list[[i]] <- tryCatch(
|
||||
fetch_batch(batches[[i]]),
|
||||
error = function(e) {
|
||||
|
|
@ -93,7 +106,6 @@ for (i in seq_along(batches)) {
|
|||
}
|
||||
|
||||
raw <- bind_rows(compact(raw_list))
|
||||
cat("Total rows fetched:", nrow(raw), "\n")
|
||||
|
||||
# Aggregate: sum across ages and genders → n per (code, year, edu_level)
|
||||
attainment <- raw |>
|
||||
|
|
@ -104,14 +116,11 @@ attainment <- raw |>
|
|||
attainment_summary <- attainment |>
|
||||
group_by(code, year) |>
|
||||
summarise(
|
||||
n_total = sum(n),
|
||||
n_total = sum(n),
|
||||
n_postsec = sum(n[edu_level >= 5]),
|
||||
pct_postsec = 100 * n_postsec / n_total,
|
||||
.groups = "drop"
|
||||
)
|
||||
|
||||
write_rds(attainment, "data/processed/attainment_ts.rds")
|
||||
write_rds(attainment, "data/processed/attainment_ts.rds")
|
||||
write_rds(attainment_summary, "data/processed/attainment_summary.rds")
|
||||
cat("Saved attainment_ts.rds and attainment_summary.rds\n")
|
||||
cat("Years:", paste(sort(unique(attainment$year)), collapse = ", "), "\n")
|
||||
cat("Municipalities:", n_distinct(attainment$code), "\n")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue