2026-05-08 10:09:14 +02:00
|
|
|
# 00 - Libraries ---------------------------------------------------------------
|
2026-05-07 14:32:28 +02:00
|
|
|
library(tidyverse)
|
|
|
|
|
library(readxl)
|
|
|
|
|
|
2026-05-08 10:09:14 +02:00
|
|
|
# 01 - Import ------------------------------------------------------------------
|
2026-05-07 14:32:28 +02:00
|
|
|
|
|
|
|
|
# Municipalities_db.xlsx has two title rows before the merged-cell header:
|
|
|
|
|
# Row 1: "Municipalities Database" (title)
|
|
|
|
|
# Row 2: blank
|
|
|
|
|
# Row 3: category names (forward-filled across sub-columns)
|
|
|
|
|
# Row 4: sub-category names
|
|
|
|
|
# Row 5+: data
|
|
|
|
|
|
2026-05-07 15:02:25 +02:00
|
|
|
# I first extract the categories and subcategories
|
2026-05-07 14:32:28 +02:00
|
|
|
# Read just the two header rows of the workbook (category names and
# sub-category names). `skip = 2` passes over the title and blank rows, and
# `col_names = FALSE` keeps both header rows as ordinary data rows instead of
# promoting the first one to column names.
header_rows <- read_xlsx(
  path = "data/Municipalities_db.xlsx",
  skip = 2,
  n_max = 2,
  col_names = FALSE
)
|
|
|
|
|
|
2026-05-07 15:02:25 +02:00
|
|
|
# Then, I reshape them into a character vector of column names in a "category_subcategory" format
|
2026-05-07 14:32:28 +02:00
|
|
|
# Build one clean column name per spreadsheet column from the two header rows.
# The work happens inside local() so the intermediate objects do not leak into
# the script's environment; only the final character vector is kept.
col_names <- local({
  # Transpose so that each spreadsheet column becomes one record holding its
  # category and sub-category, then carry each category down over the
  # sub-columns that sit under the same merged header cell.
  headers <- as_tibble(t(header_rows), .name_repair = "unique") |>
    set_names(c("category", "subcategory")) |>
    fill(category)

  # Columns without a sub-category keep the bare category name; the others
  # are labelled "category - subcategory".
  joined <- if_else(
    is.na(headers$subcategory),
    headers$category,
    str_c(headers$category, headers$subcategory, sep = " - ")
  )

  # Normalise to lower case, collapse every run of characters other than
  # a-z / 0-9 into a single underscore, and drop a trailing underscore.
  lowered <- str_to_lower(joined)
  underscored <- str_replace_all(lowered, "[^a-z0-9]+", "_")
  str_remove(underscored, "_$")
})
|
|
|
|
|
|
2026-05-07 15:02:25 +02:00
|
|
|
# Finally, I read the table, skipping the first four rows, and then apply col_names
|
2026-05-07 14:32:28 +02:00
|
|
|
# Read the data rows (everything after the first four rows) with every column
# as text, using the column names built from the header rows. Then convert all
# columns except `code` and `municipality` to numeric. Values that cannot be
# parsed as numbers become NA; the coercion warnings are silenced on purpose.
municipalities_raw <- mutate(
  read_xlsx(
    path = "data/Municipalities_db.xlsx",
    col_names = col_names,
    col_types = "text",
    skip = 4
  ),
  across(
    !c(code, municipality),
    function(column) suppressWarnings(as.numeric(column))
  )
)
|
|
|
|
|
|
2026-05-07 15:02:25 +02:00
|
|
|
# Write out as rds for future reading
|
2026-05-07 14:32:28 +02:00
|
|
|
# Make sure the output folder exists before writing: write_rds() does not
# create missing directories and would fail on a fresh checkout.
# `showWarnings = FALSE` keeps this quiet when the folder is already there.
dir.create("data/processed", recursive = TRUE, showWarnings = FALSE)
write_rds(municipalities_raw, "data/processed/m_raw.rds")
|